From b95fb6c3d66b7e8b40cfbb4e000e7e023263b2b9 Mon Sep 17 00:00:00 2001 From: yangxinyu Date: Sat, 18 May 2024 11:50:09 +0000 Subject: [PATCH 1/3] change pass order in affine-opt --- compiler/lib/Pipelines/AffineOpt.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler/lib/Pipelines/AffineOpt.cpp b/compiler/lib/Pipelines/AffineOpt.cpp index e29492ccb..857747836 100644 --- a/compiler/lib/Pipelines/AffineOpt.cpp +++ b/compiler/lib/Pipelines/AffineOpt.cpp @@ -39,8 +39,8 @@ void addGenericAffineOptPasses(OpPassManager &pm) { pm.addNestedPass(createLoopCoalescingPass()); pm.addNestedPass(createLoopFusionPass()); pm.addNestedPass(createSimplifyAffineStructuresPass()); - pm.addPass(memref::createFoldMemRefAliasOpsPass()); pm.addPass(createLowerAffinePass()); + pm.addPass(memref::createFoldMemRefAliasOpsPass()); pm.addPass(arith::createIntRangeOptimizationsPass()); addCleanUpExtPassPipeline(pm); } From 90f102c38934cf7290e884992bce528b128f3fd3 Mon Sep 17 00:00:00 2001 From: yangxinyu Date: Sat, 18 May 2024 11:51:20 +0000 Subject: [PATCH 2/3] fix SetSpaceOptPipeline in gen_testcases.py --- compiler/scripts/gen_testcases.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/compiler/scripts/gen_testcases.py b/compiler/scripts/gen_testcases.py index 18971330a..53a9551c5 100644 --- a/compiler/scripts/gen_testcases.py +++ b/compiler/scripts/gen_testcases.py @@ -138,6 +138,8 @@ def ByreTensorOptPipeline(filecheck, *, entryFunc="main"): def SetSpaceOptPipeline(filecheck, *, entryFunc="main"): return OptPipeline(E2ECollections.SetSpaceOpt, [E2ECollections.ByreOpt], [ "-remove-func-body=\"anchor-attr=__byteir_elementwise_fusion__\"", + "-inline", + "-gpu-launch-func-to-byre", "-set-op-space=\"entry-func={} space=cuda\"".format(entryFunc), "-set-arg-space=\"entry-func={} all-space=cuda\"".format(entryFunc) ], filecheck) From 577af8ebbe71f123cd721d47fea48107773c1ce2 Mon Sep 17 00:00:00 2001 From: yangxinyu Date: Sat, 18 May 2024 11:53:48 +0000 Subject: [PATCH 3/3] add python3 scripts/gen_testcases.py --top-dir=test/E2E --category=E2E result --- .../E2E/MLPInference/10b_ptx_codegen.mlir | 203 +- .../E2E/MLPInference/2_linalg_tensor_opt.mlir | 31 +- .../E2E/MLPInference/3_byre_tensor_opt.mlir | 88 +- .../E2E/MLPInference/4_bufferize_opt.mlir | 90 +- .../test/E2E/MLPInference/5_affine_opt.mlir | 84 +- .../MLPInference/5_alternative_scf_opt.mlir | 84 +- compiler/test/E2E/MLPInference/6_gpu_opt.mlir | 105 +- .../E2E/MLPInference/7_set_space_opt.mlir | 140 +- .../test/E2E/MLPInference/8_byre_opt.mlir | 124 +- .../test/E2E/MLPInference/9a_byre_host.mlir | 129 +- .../E2E/MLPInference/9b_nvvm_codegen.mlir | 129 +- .../test/E2E/MLPInference/device_output.ptx | 226 +- .../test/E2E/MLPInference/host_output.mlir | 34 +- .../test/E2E/ResNet18/BW/10b_ptx_codegen.mlir | 3968 ++++------ .../E2E/ResNet18/BW/2_linalg_tensor_opt.mlir | 599 +- .../E2E/ResNet18/BW/3_byre_tensor_opt.mlir | 1329 ++-- .../test/E2E/ResNet18/BW/4_bufferize_opt.mlir | 954 ++- .../test/E2E/ResNet18/BW/5_affine_opt.mlir | 806 +- .../ResNet18/BW/5_alternative_scf_opt.mlir | 806 +- compiler/test/E2E/ResNet18/BW/6_gpu_opt.mlir | 1648 +---- .../test/E2E/ResNet18/BW/7_set_space_opt.mlir | 2161 ++---- compiler/test/E2E/ResNet18/BW/8_byre_opt.mlir | 1898 +---- .../test/E2E/ResNet18/BW/9a_byre_host.mlir | 1984 ++--- .../test/E2E/ResNet18/BW/9b_nvvm_codegen.mlir | 1984 ++--- .../test/E2E/ResNet18/BW/device_output.ptx | 4016 ++-------- .../test/E2E/ResNet18/BW/host_output.mlir | 216 +- .../test/E2E/ResNet18/FW/10b_ptx_codegen.mlir | 5517 
++++---------- .../E2E/ResNet18/FW/2_linalg_tensor_opt.mlir | 734 +- .../E2E/ResNet18/FW/3_byre_tensor_opt.mlir | 1854 +++-- .../test/E2E/ResNet18/FW/4_bufferize_opt.mlir | 1637 ++-- .../test/E2E/ResNet18/FW/5_affine_opt.mlir | 1445 ++-- .../ResNet18/FW/5_alternative_scf_opt.mlir | 1445 ++-- compiler/test/E2E/ResNet18/FW/6_gpu_opt.mlir | 2391 +----- .../test/E2E/ResNet18/FW/7_set_space_opt.mlir | 3642 ++------- compiler/test/E2E/ResNet18/FW/8_byre_opt.mlir | 2858 ++----- .../test/E2E/ResNet18/FW/9a_byre_host.mlir | 2903 ++------ .../test/E2E/ResNet18/FW/9b_nvvm_codegen.mlir | 2903 ++------ .../test/E2E/ResNet18/FW/device_output.ptx | 6317 +++------------- .../test/E2E/ResNet18/FW/host_output.mlir | 272 +- .../ResNet18/Whole/2_linalg_tensor_opt.mlir | 1172 +-- .../E2E/ResNet18/Whole/3_byre_tensor_opt.mlir | 3681 +++++---- .../E2E/ResNet18/Whole/4_bufferize_opt.mlir | 3369 ++++++--- .../test/E2E/ResNet18/Whole/5_affine_opt.mlir | 2874 +++++--- .../ResNet18/Whole/5_alternative_scf_opt.mlir | 2874 +++++--- .../test/E2E/ResNet18/Whole/6_gpu_opt.mlir | 5041 ++++--------- .../E2E/ResNet18/Whole/7_set_space_opt.mlir | 6567 ++++++----------- .../test/E2E/ResNet18/Whole/8_byre_opt.mlir | 5505 +++++--------- .../test/E2E/ResNet18/Whole/9a_byre_host.mlir | 5691 +++++--------- .../E2E/ResNet18/Whole/9b_nvvm_codegen.mlir | 5698 +++++--------- .../test/E2E/ResNet18/Whole/host_output.mlir | 619 +- 50 files changed, 34889 insertions(+), 65956 deletions(-) diff --git a/compiler/test/E2E/MLPInference/10b_ptx_codegen.mlir b/compiler/test/E2E/MLPInference/10b_ptx_codegen.mlir index 25d987adc..47adb94ce 100644 --- a/compiler/test/E2E/MLPInference/10b_ptx_codegen.mlir +++ b/compiler/test/E2E/MLPInference/10b_ptx_codegen.mlir @@ -4,7 +4,7 @@ module attributes {byre.container_module, gpu.container_module, torch.debug_module_name = "GraphModule"} { gpu.module @unified { - llvm.func @Unknown2(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: i64, %arg12: !llvm.ptr, %arg13: !llvm.ptr, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.func @Unknown2(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr {llvm.noalias}, %arg6: !llvm.ptr {llvm.noalias}, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: i64, %arg12: !llvm.ptr {llvm.noalias}, %arg13: !llvm.ptr {llvm.noalias}, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> @@ -20,95 +20,46 @@ module attributes {byre.container_module, gpu.container_module, torch.debug_modu %12 = llvm.mlir.constant(0 : index) : i64 %13 = llvm.mlir.constant(20 : index) : i64 %14 = llvm.mlir.constant(10 : index) : i64 - %15 = llvm.mlir.constant(-1 : index) : i64 - %16 = nvvm.read.ptx.sreg.ctaid.x : i32 - %17 = llvm.sext %16 : i32 to i64 - %18 = nvvm.read.ptx.sreg.ntid.x : i32 - %19 = llvm.sext %18 : i32 to i64 - %20 = nvvm.read.ptx.sreg.tid.x : i32 - %21 = llvm.sext %20 : i32 to i64 - %22 = llvm.mul %19, %17 : i64 - %23 = llvm.add %21, %22 : i64 - %24 = llvm.icmp "slt" %23, %13 : i64 - llvm.cond_br %24, ^bb1, ^bb2 - 
^bb1: // pred: ^bb0 - %25 = llvm.srem %23, %14 : i64 - %26 = llvm.icmp "slt" %25, %12 : i64 - %27 = llvm.add %25, %14 : i64 - %28 = llvm.select %26, %27, %25 : i1, i64 - %29 = llvm.icmp "slt" %23, %12 : i64 - %30 = llvm.sub %15, %23 : i64 - %31 = llvm.select %29, %30, %23 : i1, i64 - %32 = llvm.sdiv %31, %14 : i64 - %33 = llvm.sub %15, %32 : i64 - %34 = llvm.select %29, %33, %32 : i1, i64 - %35 = llvm.mul %34, %14 : i64 - %36 = llvm.add %35, %28 : i64 - %37 = llvm.getelementptr %arg6[%36] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %38 = llvm.load %37 : !llvm.ptr -> f32 - %39 = llvm.getelementptr %arg1[%28] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %40 = llvm.load %39 : !llvm.ptr -> f32 - %41 = llvm.fadd %38, %40 : f32 - %42 = llvm.getelementptr %arg13[%36] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %41, %42 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown1(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: i64, %arg12: !llvm.ptr, %arg13: !llvm.ptr, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %4 = llvm.insertvalue %arg5, %3[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %5 = llvm.insertvalue %arg6, %4[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %6 = llvm.insertvalue %arg7, %5[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %7 = llvm.insertvalue %arg8, %6[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %8 = llvm.insertvalue %arg12, %3[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %9 = llvm.insertvalue %arg13, %8[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %10 = llvm.insertvalue %arg14, %9[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %11 = llvm.insertvalue %arg15, %10[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %12 = llvm.mlir.constant(0.000000e+00 : f32) : f32 - %13 = llvm.mlir.constant(0 : index) : i64 - %14 = llvm.mlir.constant(40 : index) : i64 - %15 = llvm.mlir.constant(20 : index) : i64 - %16 = llvm.mlir.constant(-1 : index) : i64 - %17 = nvvm.read.ptx.sreg.ctaid.x : i32 + %15 = nvvm.read.ptx.sreg.ctaid.x : i32 + %16 = llvm.sext %15 : i32 to i64 + %17 = nvvm.read.ptx.sreg.ntid.x : i32 %18 = llvm.sext %17 : i32 to i64 - %19 = nvvm.read.ptx.sreg.ntid.x : i32 + %19 = nvvm.read.ptx.sreg.tid.x : i32 %20 = llvm.sext %19 : i32 to i64 - %21 = nvvm.read.ptx.sreg.tid.x : i32 - %22 = llvm.sext %21 : i32 to i64 - %23 = llvm.mul %20, %18 : i64 - %24 = llvm.add %22, %23 : i64 - %25 = llvm.icmp "slt" %24, %14 : i64 - llvm.cond_br %25, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %26 = llvm.srem %24, %15 : i64 + %21 = llvm.mul %18, %16 : i64 + %22 = llvm.add %20, %21 : i64 + %23 = nvvm.read.ptx.sreg.nctaid.x : i32 + %24 = llvm.sext %23 : i32 to i64 + %25 = llvm.mul %18, %24 : i64 + llvm.br ^bb1(%22 : i64) + ^bb1(%26: i64): // 2 preds: ^bb0, ^bb2 %27 = llvm.icmp "slt" %26, %13 : i64 - %28 = llvm.add %26, %15 : i64 - %29 = 
llvm.select %27, %28, %26 : i1, i64 - %30 = llvm.icmp "slt" %24, %13 : i64 - %31 = llvm.sub %16, %24 : i64 - %32 = llvm.select %30, %31, %24 : i1, i64 - %33 = llvm.sdiv %32, %15 : i64 - %34 = llvm.sub %16, %33 : i64 - %35 = llvm.select %30, %34, %33 : i1, i64 - %36 = llvm.mul %35, %15 : i64 - %37 = llvm.add %36, %29 : i64 - %38 = llvm.getelementptr %arg6[%37] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.cond_br %27, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %28 = llvm.srem %26, %14 : i64 + %29 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %30 = llvm.mlir.constant(1 : index) : i64 + %31 = llvm.getelementptr %arg1[%28] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %32 = llvm.load %31 : !llvm.ptr -> f32 + %33 = llvm.insertvalue %26, %5[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %34 = llvm.insertvalue %30, %33[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %35 = llvm.getelementptr %arg6[%26] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %36 = llvm.mul %12, %14 : i64 + %37 = llvm.add %36, %12 : i64 + %38 = llvm.getelementptr %35[%37] : (!llvm.ptr, i64) -> !llvm.ptr, f32 %39 = llvm.load %38 : !llvm.ptr -> f32 - %40 = llvm.getelementptr %arg1[%29] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %41 = llvm.load %40 : !llvm.ptr -> f32 - %42 = llvm.fadd %39, %41 : f32 - %43 = llvm.intr.maxnum(%42, %12) : (f32, f32) -> f32 - %44 = llvm.getelementptr %arg13[%37] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %43, %44 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 + %40 = llvm.fadd %39, %32 : f32 + %41 = llvm.insertvalue %26, %9[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %42 = llvm.insertvalue %30, %41[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %43 = llvm.getelementptr %arg13[%26] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %44 = llvm.getelementptr %43[%37] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %40, %44 : f32, !llvm.ptr + %45 = llvm.add %26, %25 : i64 + llvm.br ^bb1(%45 : i64) + ^bb3: // pred: ^bb1 llvm.return } - llvm.func @Unknown0(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: i64, %arg12: !llvm.ptr, %arg13: !llvm.ptr, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.func @Unknown0(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr {llvm.noalias}, %arg6: !llvm.ptr {llvm.noalias}, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: i64, %arg12: !llvm.ptr {llvm.noalias}, %arg13: !llvm.ptr {llvm.noalias}, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> @@ -121,51 +72,55 @@ module attributes {byre.container_module, gpu.container_module, torch.debug_modu %9 = llvm.insertvalue %arg13, %8[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> %10 = llvm.insertvalue %arg14, %9[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> %11 = llvm.insertvalue %arg15, %10[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %12 = llvm.mlir.constant(0.000000e+00 : f32) : f32 - %13 = 
llvm.mlir.constant(0 : index) : i64 - %14 = llvm.mlir.constant(40 : index) : i64 + %12 = llvm.mlir.constant(0 : index) : i64 + %13 = llvm.mlir.constant(40 : index) : i64 + %14 = llvm.mlir.constant(0.000000e+00 : f32) : f32 %15 = llvm.mlir.constant(20 : index) : i64 - %16 = llvm.mlir.constant(-1 : index) : i64 - %17 = nvvm.read.ptx.sreg.ctaid.x : i32 - %18 = llvm.sext %17 : i32 to i64 - %19 = nvvm.read.ptx.sreg.ntid.x : i32 - %20 = llvm.sext %19 : i32 to i64 - %21 = nvvm.read.ptx.sreg.tid.x : i32 - %22 = llvm.sext %21 : i32 to i64 - %23 = llvm.mul %20, %18 : i64 - %24 = llvm.add %22, %23 : i64 - %25 = llvm.icmp "slt" %24, %14 : i64 - llvm.cond_br %25, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %26 = llvm.srem %24, %15 : i64 - %27 = llvm.icmp "slt" %26, %13 : i64 - %28 = llvm.add %26, %15 : i64 - %29 = llvm.select %27, %28, %26 : i1, i64 - %30 = llvm.icmp "slt" %24, %13 : i64 - %31 = llvm.sub %16, %24 : i64 - %32 = llvm.select %30, %31, %24 : i1, i64 - %33 = llvm.sdiv %32, %15 : i64 - %34 = llvm.sub %16, %33 : i64 - %35 = llvm.select %30, %34, %33 : i1, i64 - %36 = llvm.mul %35, %15 : i64 - %37 = llvm.add %36, %29 : i64 - %38 = llvm.getelementptr %arg6[%37] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %39 = llvm.load %38 : !llvm.ptr -> f32 - %40 = llvm.getelementptr %arg1[%29] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %41 = llvm.load %40 : !llvm.ptr -> f32 - %42 = llvm.fadd %39, %41 : f32 - %43 = llvm.intr.maxnum(%42, %12) : (f32, f32) -> f32 - %44 = llvm.getelementptr %arg13[%37] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %43, %44 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 + %16 = nvvm.read.ptx.sreg.ctaid.x : i32 + %17 = llvm.sext %16 : i32 to i64 + %18 = nvvm.read.ptx.sreg.ntid.x : i32 + %19 = llvm.sext %18 : i32 to i64 + %20 = nvvm.read.ptx.sreg.tid.x : i32 + %21 = llvm.sext %20 : i32 to i64 + %22 = llvm.mul %19, %17 : i64 + %23 = llvm.add %21, %22 : i64 + %24 = nvvm.read.ptx.sreg.nctaid.x : i32 + %25 = llvm.sext %24 : i32 to i64 + %26 = llvm.mul %19, %25 : i64 + llvm.br ^bb1(%23 : i64) + ^bb1(%27: i64): // 2 preds: ^bb0, ^bb2 + %28 = llvm.icmp "slt" %27, %13 : i64 + llvm.cond_br %28, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %29 = llvm.srem %27, %15 : i64 + %30 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %31 = llvm.mlir.constant(1 : index) : i64 + %32 = llvm.getelementptr %arg1[%29] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %33 = llvm.load %32 : !llvm.ptr -> f32 + %34 = llvm.insertvalue %27, %5[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %35 = llvm.insertvalue %31, %34[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %36 = llvm.getelementptr %arg6[%27] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %37 = llvm.mul %12, %15 : i64 + %38 = llvm.add %37, %12 : i64 + %39 = llvm.getelementptr %36[%38] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %40 = llvm.load %39 : !llvm.ptr -> f32 + %41 = llvm.fadd %40, %33 : f32 + %42 = llvm.intr.maximum(%41, %14) : (f32, f32) -> f32 + %43 = llvm.insertvalue %27, %9[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %44 = llvm.insertvalue %31, %43[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %45 = llvm.getelementptr %arg13[%27] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %46 = llvm.getelementptr %45[%38] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %42, %46 : f32, !llvm.ptr + %47 = llvm.add %27, %26 : i64 + llvm.br ^bb1(%47 : i64) + ^bb3: // pred: ^bb1 llvm.return } } - memref.global "private" constant @__constant_10x20xf32_cuda : memref<10x20xf32, "cuda"> = 
dense<"0xEDAEFD3D6B88963E1051E33DFC7732BE055CEE3C07F9413E080B9CBE9B2F4ABE5608BD3BF8E6DFBD507F46BEC61183BEACE23ABEF010903E824129BEAFB6D83C779721BE953B2B3E8B44CB3BB09383BE2456463EA8E7983ECE3D9EBC690042BEF0D34CBD5AFCAB3DCC1AF13D8E3CF43B0BF2583EC82B583D658D653C79C131BE9AEF24BD85B4B03D46DBAF3DEB4013BD26A9693EB17CC43CEDAF77BEA24E5BBE409C203EA0BE27BCA3380BBEC03C5A3E2775633D62069C3E0DF3963D259F883E7AAD743EEB5AC8BD4B210F3ECE1F303E3D4983BDA3A3F63D3D993FBE868FF1BC89B98ABE13D72E3D5012703E826D35BE725C76BECA748B3ED59EA23D63F6103E2F3527BE354722BECE329DBE39E496BE46C71CBDC849E83D8BD2ADBD2596A73DF2DA0ABED46BA03D752989BDB8624CBE971808BEB1C184BDCC23743E36B4AB3C7D51683E7ADA4B3EDA6EDD3D2DFDF1BDC9A5D5BD948E7E3ECFCE7FBE6E9E813D85DBE43B6543143D379970BEA80F16BED17C18BD05E9193E0F403B3D26EDA43B808C9ABEA6A2823D08D786BE2314AABCD8D783BE562D9ABE2B60943E88C91ABEB2DC513E136825BEBB0DBD3D8E26E7BDF4464EBDD9E68E3E5B5115BE3FCF123E1E194FBE2CACBF3DC71E153D96F24B3E53E9E9BD407141BE714361BE4D2E963DA67B8F3EB62B78BE0D424E3D5EBB7F3D056098BEC5F011BDCB26033E1656DD3C60B90DBE7EFF583DE9C118BDDDF5673E451A58BD8E0C32BE121C05BC8D148E3ED80AEA3DBE27AA3A1F534B3E97023EBD85C231BEEB7CF33BF5FF363D32DCDBBDFBFED83DB640623D66E5EA3D785D2B3E4E7470BE651C673EDC57473E601A24BECF5FFABD38AA14BE9EFC60BCE3E5983E10EE7D3E2BE0453ECE3C41BE0A3B753E4225383D967059BDC197B8BD4FA407BE5C0C17BE0F16953E9884E7BC2E6CF6BDAF01453EB6F466BE37968ABE5BC056BECBD28E3ED9AC493DACC7103D46EB73BEBB39213DFD30F0BD835CD4BD426F4FBD620472BE67FB97BEFBAE8CBDF08F553EFAB77D3EFCC7D3BC76249D3EA2C404BE937B9B3E14197E3D30212D3E262F63BD7899D1BDF72C993E7537DE3D6FD39D3E988D4B3DDB132B3E097461BD8461313C"> - memref.global "private" constant @__constant_20x20xf32_cuda : memref<20x20xf32, "cuda"> = dense<"0x32A148BE93869BBDDF0E313EADAE1EBD5C52443E3506E7BBF2CFB3BD4CF5A13DBC209B3D697E213D4A14543DAB303B3EB6C1FA3B3524DBBCE1C81ABE9DDF51BEBF11173EFFDA4C3E6E8A2DBEF9F5013E6D99D1BDBEED34BDA2CFA0BCBFCE46BE85B19BBC4D7659BE398863BEC7A3DD3D590C1ABE4A942DBE9EB929BE6A0D20BE941C90BD097A76BC0640443EAE5B30BE723C573EC0EE403E0F2024BE3D24D53D3C565A3E62E5843DA45CAF3C41EC6DBBE2019E3D1D3A013E42C74CBE4E3CE63D734E60BE530B01BECD1844BEEC38BE3C19E862BEFC6F0DBEE58D4F3D03A7AA3D1DCF543C756D503E81D23BBEE0D914BE614360BCC40D4F3E5A313E3E34D6FDBD9C3519BE72500C3D1B39AABD9B2983BD110010BDAFAD27BCEAFFB5BD4A82103D8E7AD13CD2693FBE3E6125BE03FD743D3B3825BEB7FCD53DCCDA58BE2B3A4CBE1E9BDF3BFF38F73CC33C35BD40D157BEDD478A3CD1E6373E436BEEBD04BE2E3EBD5F2B3D1D7813BE0EF101BE0369A1BD8FF8D93D829D3A3E85CB2EBC6A2B023EEF8C273EB6F3C2BD27F955BDAE3B4ABC265A9D3D45B35D3E112654BB8BEEB6BDBD70993D7DDE62BEDB8017BEE2AC4FBE9F8E213E592C1B3E809B3DBE7DF30B3ED463133D672DEB3DEB5A44BEFFA2B03C894A33BE8A2935BE93A6223EE10741BEBD5F18BEDE0A0EBEEDE673BD23F447BEB37F3BBE052B503E5683343E13D6283EC693A03D751BB5BDF94F243E5014C23C003410BEEBBD613ECADD55BE56C2C3BD8922313DA41CF7BD0CF8D5BD13AA29BE8FE25B3ECC6652BE20FF4CBED78823BD32E5003E392A323DE1BAA13DFB3A9BBD9FF356BD3C4FE13DFDFA51BDB60E2C3EBEE61D3EC5EFDA3DE721A0BD13E8F3BC0C34F5BD3E911D3EFD7F38BD3C54933DC201E4BC35B79BBD09665BBE2F2FC8BDBEFA293E668CEDBD0F397ABDC1675D3EC0F1E03D164A5FBDAA43353C19AC2FBE1A2E90BC3881E6BDC3EA5EBE550D21BEB6B3023E1E97B0BD500C6DBD9577BB3D085E783C9044093D916844BE162F39BE955128BEB6F210BDA4C90BBDCD44643E511B3DBE8941593E30FEC73CBEC5D73D8FF2A2BDCB4F4B3E4F6DC2BB2249463E35888ABC80299C3D70BAFEBD40F617BEA941FFBDF13E123E4C305C3E5C838CBDF44A6EBD2A505EBEF0BFA83A8BC9E03DD0DA303D64522C3E58A96FBD7211593EC3BEA6BC14EF8ABDFE5949BE43880FBE46CE16BD097B633D7775193EB7E90F3ED2D3953DC271633D6BF8053DA6DFECBD5728953DB72E1DBD2B345DBD7053E83CE305B1BDF2F3013E0D
E876BD8F1098BD7232E23D4CDE0C3DAF43A93D1A34DE3DFB1247BD569B3F3E09619BBD54502A3E6BA636BC17C3273EBE032E3E34F4173C54B10A3E9452EB3D82FDD33D005014BE578D51BDF3E116BE4EDE0F3E47BF58BEC35E793D7195303ED3A822BEA331963A9CF9403E1FD32DBD4EFCBB3DB47251BDD3A570BD3C32473E614DAD3A6978643EC49A543E02AA16BEDFDE243D7ADBE9BDE1224FBE6C6826BE5EC4083E7B00783D9E975C3EEF3A1FBDC7AD1DBDCB200EBDE55E993D734D97BC4C6E543E762EE83DBF6DFBBD5F161E3CB941F3BDF40BB2BD4C2E6ABD3B33963D7B3507BE3344553ED50454BD689A233E263939BE1D60D1BB4E3959BE0BAF203C719EC8BD48F3273EFB4B103D7943B0BDA66038BEC91500BE50263FBE52262FBE207BA93D661E823DEC4B56BE58CF3A3E913828BED66E42BE57F2B2BDFFB9503EE2FD4A3E2F23A43C542B493EE2CA34BE2966513EBB4B1FBD391127BEC942A7BD196DD9BD8AC012BD6514C1BC1F9E173EA59780BD511514BD9AB8F4BD339719BEB72639BE38C9D73CB5062F3E895743BD413055BE423D12BC402404BEB4A8463E4EA9F0BD57EA383C2432453C9611263ED58B283E6313C13DC2C95FBEA30B07BE15F2E4BD611C4BBE4FEF053CC73E013E9DF7B5BDED4160BE917CFDBD7FC9523E0A8B643EC300013E966686BDD0DC1ABE05B535BECD8A573E475E283DCECB5EBDFDE8B13D367E3EBEFCD7B43D3C7D24BECE26BD3D48BF20BEF7F0D7BB421F193E85BA77BDCAE272BAB0FD9CBD487E3FBEBBF9603D8B5E24BE27A9C33D95A2063E6DD0BC3DA359173EDA01A5BD505B97BDDEE6713D5283B43DE477DBBC674E2CBED5E3593DBCCB2FBED7D1303EA9753FBE017FC43DA94A9B3DE74A0E3E2FFC273EA596B2BDD8554A3C703E19BE04CF7BBD9155863D65FA1EBE144C573E709798BBF860CA3CC63F44BE"> - memref.global "private" constant @__constant_20x10xf32_cuda : memref<20x10xf32, "cuda"> = dense<"0xD60ED23D6677593EEDD4253D579F523E952583BD454F62BE2C2AB53D827CF4BD7F4D41BD15896C3DDEFF3B3D5270AF3D461635BC07D5B8BDB82761BE33120CBE692C35BCDB70183EED6F4D3E271EAD3DFBC1653B77D0473E7920B43C2135C43C1780C2BBD145D43C15B1203DAAD3623C9A1808BECF144D3E7AFAC43D933CF8BDFE637BBD21BE53BEADCA233E250529BD4576FABD2B7B84BCCCB48FBC84B3D6BDAB54263EC31061BEF81913BE3E001C3E4D7A0E3EE4CE4EBE2119CFBDC6B6123EB3CA01BEB506BF3DDA0774BD37FD1E3D1EB44BBE38E409BE994A1FBE74FFDB3DBB05A93D29209A3D756B01BDEAA162BD602A03BE522F3FBE9C1DDC3D486F1EBEB99D21BEE05813BE074943BE4233E33C32BBFA3D81F31ABE747C53BEC1814A3EA77B363E551E113E65E5673D0EDAC4BDC9732E3DD45C3C3E1BCB113E9DD61DBDC754B93DC9CB88BCB5D84DBEF14E443ED54E1EBE659C343E349F62BEF5BBD73D1042FDBDBFDD4EBD306D813DE33C16BE39B00EBE946CD7BDB97CC9BD08A411BDC933103EDE725FBEF3C1553D830E3B3CBFD7253ECFBD0D3EDC6CD83DA6EA563D9238C53C1405B2BD4948D83C0A72B03DEC8AD93DD04F3A3E96216E3C0351123E034807BE0929E7BDD3E3453DCD388B3D68C034BCE7650E3E41C70D3E392C32BEEA885CBC19FFD9BD8B68F83C6FE3B5BCC302633E736E193ED8D1503EBEB64B3EE0263C3EAC4E223DB90511BE36A5443E86AA713C3698A3BCB129A93D517218BEA7BB51BECA19393E8948043E5DE08BBDA1F012BED3483E3EB3F664BEEB093DBEEF2FB7BC3243F6BC2EFA283EB41EF5BDF16A573D19E6E83DDCF4F23DA4D8C3BD4877DFBC1B13303B60330D3ECA5569BD97A090BD3A5E9C3D3565B63DE54F6BBDAC1B29BDDEB4CD3D6C5006BE61B08FBC5AD9DA3B80340EBE933AD83D6334E93CE5F18EBDCC3D73BD128D2ABD46CA34BDDDA5BABDE0C404BD533F223D3072B4BD7D7C413EB831DABD3359BBBB0A69993D6024AD3D0D0EE9BDBD5A80BC7FA6E83D400BFBBD616438BE34CBC73D4769D73D4E295EBEECBE843C838E9CBDA171FA3DDCED3EBE9FEDA6BDAD78A8BDD19702BD18B3043E39CF873C46E602BCD0CBC33D"> memref.global "private" constant @__constant_10xf32_cuda : memref<10xf32, "cuda"> = dense<[0.0670170113, 0.0825609341, -0.125343189, -0.0073415176, -0.100303039, -0.214000896, 0.114002995, 0.21737574, 0.166609675, -0.119800359]> + memref.global "private" constant @__constant_10x20xf32_cuda : memref<10x20xf32, "cuda"> = 
dense<"0xD60ED23DDEFF3B3DFBC1653B7AFAC43DAB54263EDA0774BD602A03BE747C53BEC754B93D306D813DBFD7253E96216E3CEA885CBCB90511BEA1F012BEDCF4F23DAC1B29BD128D2ABD6024AD3D838E9CBD6677593E5270AF3D77D0473E933CF8BDC31061BE37FD1E3D522F3FBEC1814A3EC9CB88BCE33C16BECFBD0D3E0351123E19FFD9BD36A5443ED3483E3EA4D8C3BDDEB4CD3D46CA34BD0D0EE9BDA171FA3DEDD4253D461635BC7920B43CFE637BBDF81913BE1EB44BBE9C1DDC3DA77B363EB5D84DBE39B00EBEDC6CD83D034807BE8B68F83C86AA713CB3F664BE4877DFBC6C5006BEDDA5BABDBD5A80BCDCED3EBE579F523E07D5B8BD2135C43C21BE53BE3E001C3E38E409BE486F1EBE551E113EF14E443E946CD7BDA6EA563D0929E7BD6FE3B5BC3698A3BCEB093DBE1B13303B61B08FBCE0C404BD7FA6E83D9FEDA6BD952583BDB82761BE1780C2BBADCA233E4D7A0E3E994A1FBEB99D21BE65E5673DD54E1EBEB97CC9BD9238C53CD3E3453DC302633EB129A93DEF2FB7BC60330D3E5AD9DA3B533F223D400BFBBDAD78A8BD454F62BE33120CBED145D43C250529BDE4CE4EBE74FFDB3DE05813BE0EDAC4BD659C343E08A411BD1405B2BDCD388B3D736E193E517218BE3243F6BCCA5569BD80340EBE3072B4BD616438BED19702BD2C2AB53D692C35BC15B1203D4576FABD2119CFBDBB05A93D074943BEC9732E3D349F62BEC933103E4948D83C68C034BCD8D1503EA7BB51BE2EFA283E97A090BD933AD83D7D7C413E34CBC73D18B3043E827CF4BDDB70183EAAD3623C2B7B84BCC6B6123E29209A3D4233E33CD45C3C3EF5BBD73DDE725FBE0A72B03DE7650E3EBEB64B3ECA19393EB41EF5BD3A5E9C3D6334E93CB831DABD4769D73D39CF873C7F4D41BDED6F4D3E9A1808BECCB48FBCB3CA01BE756B01BD32BBFA3D1BCB113E1042FDBDF3C1553DEC8AD93D41C70D3EE0263C3E8948043EF16A573D3565B63DE5F18EBD3359BBBB4E295EBE46E602BC15896C3D271EAD3DCF144D3E84B3D6BDB506BF3DEAA162BD81F31ABE9DD61DBDBFDD4EBD830E3B3CD04F3A3E392C32BEAC4E223D5DE08BBD19E6E83DE54F6BBDCC3D73BD0A69993DECBE843CD0CBC33D"> memref.global "private" constant @__constant_20xf32_0_cuda : memref<20xf32, "cuda"> = dense<[0.124238588, -0.0375917405, -0.178324029, 2.1018261E-4, -0.0708629936, 0.179958493, 0.201986402, -0.0302014686, -0.0842267424, 0.0796111747, 0.0201944318, -0.183529228, -0.133614406, -0.0192934573, 0.193412527, 0.219010666, -0.0464102961, 0.00334274326, -0.0029087835, 0.0903228372]> + memref.global "private" constant @__constant_20x20xf32_cuda : memref<20x20xf32, "cuda"> = 
dense<"0x32A148BE6D99D1BD3C565A3E614360BC1E9BDF3B265A9D3DBD5F18BE8FE25B3EC201E4BC085E783CA941FFBDD2D3953D6BA636BCD3A570BDBF6DFBBD50263FBE6514C1BCC2C95FBEFCD7B43DE477DBBC93869BBDBEED34BD62E5843DC40D4F3EFF38F73C45B35D3EDE0A0EBECC6652BE35B79BBD9044093DF13E123EC271633D17C3273E3C32473E5F161E3C52262FBE1F9E173EA30B07BE3C7D24BE674E2CBEDF0E313EA2CFA0BCA45CAF3C5A313E3EC33C35BD112654BBEDE673BD20FF4CBE09665BBE916844BE4C305C3E6BF8053DBE032E3E614DAD3AB941F3BD207BA93DA59780BD15F2E4BDCE26BD3DD5E3593DADAE1EBDBFCE46BE41EC6DBB34D6FDBD40D157BE8BEEB6BD23F447BED78823BD2F2FC8BD162F39BE5C838CBDA6DFECBD34F4173C6978643EF40BB2BD661E823D511514BD611C4BBE48BF20BEBCCB2FBE5C52443E85B19BBCE2019E3D9C3519BEDD478A3CBD70993DB37F3BBE32E5003EBEFA293E955128BEF44A6EBD5728953D54B10A3EC49A543E4C2E6ABDEC4B56BE9AB8F4BD4FEF053CF7F0D7BBD7D1303E3506E7BB4D7659BE1D3A013E72500C3DD1E6373E7DDE62BE052B503E392A323D668CEDBDB6F210BD2A505EBEB72E1DBD9452EB3D02AA16BE3B33963D58CF3A3E339719BEC73E013E421F193EA9753FBEF2CFB3BD398863BE42C74CBE1B39AABD436BEEBDDB8017BE5683343EE1BAA13D0F397ABDA4C90BBDF0BFA83A2B345DBD82FDD33DDFDE243D7B3507BE913828BEB72639BE9DF7B5BD85BA77BD017FC43D4CF5A13DC7A3DD3D4E3CE63D9B2983BD04BE2E3EE2AC4FBE13D6283EFB3A9BBDC1675D3ECD44643E8BC9E03D7053E83C005014BE7ADBE9BD3344553ED66E42BE38C9D73CED4160BECAE272BAA94A9B3DBC209B3D590C1ABE734E60BE110010BDBD5F2B3D9F8E213EC693A03D9FF356BDC0F1E03D511B3DBED0DA303DE305B1BD578D51BDE1224FBED50454BD57F2B2BDB5062F3E917CFDBDB0FD9CBDE74A0E3E697E213D4A942DBE530B01BEAFAD27BC1D7813BE592C1B3E751BB5BD3C4FE13D164A5FBD8941593E64522C3EF2F3013EF3E116BE6C6826BE689A233EFFB9503E895743BD7FC9523E487E3FBE2FFC273E4A14543D9EB929BECD1844BEEAFFB5BD0EF101BE809B3DBEF94F243EFDFA51BDAA43353C30FEC73C58A96FBD0DE876BD4EDE0F3E5EC4083E263939BEE2FD4A3E413055BE0A8B643EBBF9603DA596B2BDAB303B3E6A0D20BEEC38BE3C4A82103D0369A1BD7DF30B3E5014C23CB60E2C3E19AC2FBEBEC5D73D7211593E8F1098BD47BF58BE7B00783D1D60D1BB2F23A43C423D12BCC300013E8B5E24BED8554A3CB6C1FA3B941C90BD19E862BE8E7AD13C8FF8D93DD463133D003410BEBEE61D3E1A2E90BC8FF2A2BDC3BEA6BC7232E23DC35E793D9E975C3E4E3959BE542B493E402404BE966686BD27A9C33D703E19BE3524DBBC097A76BCFC6F0DBED2693FBE829D3A3E672DEB3DEBBD613EC5EFDA3D3881E6BDCB4F4B3E14EF8ABD4CDE0C3D7195303EEF3A1FBD0BAF203CE2CA34BEB4A8463ED0DC1ABE95A2063E04CF7BBDE1C81ABE0640443EE58D4F3D3E6125BE85CB2EBCEB5A44BECADD55BEE721A0BDC3EA5EBE4F6DC2BBFE5949BEAF43A93DD3A822BEC7AD1DBD719EC8BD2966513E4EA9F0BD05B535BE6DD0BC3D9155863D9DDF51BEAE5B30BE03A7AA3D03FD743D6A2B023EFFA2B03C56C2C3BD13E8F3BC550D21BE2249463E43880FBE1A34DE3DA331963ACB200EBD48F3273EBB4B1FBD57EA383CCD8A573EA359173E65FA1EBEBF11173E723C573E1DCF543C3B3825BEEF8C273E894A33BE8922313D0C34F5BDB6B3023E35888ABC46CE16BDFB1247BD9CF9403EE55E993DFB4B103D391127BE2432453C475E283DDA01A5BD144C573EFFDA4C3EC0EE403E756D503EB7FCD53DB6F3C2BD8A2935BEA41CF7BD3E911D3E1E97B0BD80299C3D097B633D569B3F3E1FD32DBD734D97BC7943B0BDC942A7BD9611263ECECB5EBD505B97BD709798BB6E8A2DBE0F2024BE81D23BBECCDA58BE27F955BD93A6223E0CF8D5BDFD7F38BD500C6DBD70BAFEBD7775193E09619BBD4EFCBB3D4C6E543EA66038BE196DD9BDD58B283EFDE8B13DDEE6713DF860CA3CF9F5013E3D24D53DE0D914BE2B3A4CBEAE3B4ABCE10741BE13AA29BE3C54933D9577BB3D40F617BEB7E90F3E54502A3EB47251BD762EE83DC91500BE8AC012BD6313C13D367E3EBE5283B43DC63F44BE"> memref.global "private" constant @__constant_20xf32_cuda : memref<20xf32, "cuda"> = dense<[0.101879634, -0.178835288, -0.0953023583, -0.0698504745, -0.19658649, -0.297641844, -0.223349303, 0.168986112, 9.007710e-02, 0.101534814, -0.0601868108, -0.0958566219, -0.243612081, 0.198881537, -0.293788224, -0.240900397, 0.184188008, 0.210917979, 
0.121171109, -0.155078679]> + memref.global "private" constant @__constant_20x10xf32_cuda : memref<20x10xf32, "cuda"> = dense<"0xEDAEFD3D2456463E409C203E725C76BE7D51683ED8D783BEA67B8F3E85C231BE4225383D620472BE6B88963EA8E7983EA0BE27BCCA748B3E7ADA4B3E562D9ABEB62B78BEEB7CF33B967059BD67FB97BE1051E33DCE3D9EBCA3380BBED59EA23DDA6EDD3D2B60943E0D424E3DF5FF363DC197B8BDFBAE8CBDFC7732BE690042BEC03C5A3E63F6103E2DFDF1BD88C91ABE5EBB7F3D32DCDBBD4FA407BEF08F553E055CEE3CF0D34CBD2775633D2F3527BEC9A5D5BDB2DC513E056098BEFBFED83D5C0C17BEFAB77D3E07F9413E5AFCAB3D62069C3E354722BE948E7E3E136825BEC5F011BDB640623D0F16953EFCC7D3BC080B9CBECC1AF13D0DF3963DCE329DBECFCE7FBEBB0DBD3DCB26033E66E5EA3D9884E7BC76249D3E9B2F4ABE8E3CF43B259F883E39E496BE6E9E813D8E26E7BD1656DD3C785D2B3E2E6CF6BDA2C404BE5608BD3B0BF2583E7AAD743E46C71CBD85DBE43BF4464EBD60B90DBE4E7470BEAF01453E937B9B3EF8E6DFBDC82B583DEB5AC8BDC849E83D6543143DD9E68E3E7EFF583D651C673EB6F466BE14197E3D507F46BE658D653C4B210F3E8BD2ADBD379970BE5B5115BEE9C118BDDC57473E37968ABE30212D3EC61183BE79C131BECE1F303E2596A73DA80F16BE3FCF123EDDF5673E601A24BE5BC056BE262F63BDACE23ABE9AEF24BD3D4983BDF2DA0ABED17C18BD1E194FBE451A58BDCF5FFABDCBD28E3E7899D1BDF010903E85B4B03DA3A3F63DD46BA03D05E9193E2CACBF3D8E0C32BE38AA14BED9AC493DF72C993E824129BE46DBAF3D3D993FBE752989BD0F403B3DC71E153D121C05BC9EFC60BCACC7103D7537DE3DAFB6D83CEB4013BD868FF1BCB8624CBE26EDA43B96F24B3E8D148E3EE3E5983E46EB73BE6FD39D3E779721BE26A9693E89B98ABE971808BE808C9ABE53E9E9BDD80AEA3D10EE7D3EBB39213D988D4B3D953B2B3EB17CC43C13D72E3DB1C184BDA6A2823D407141BEBE27AA3A2BE0453EFD30F0BDDB132B3E8B44CB3BEDAF77BE5012703ECC23743E08D786BE714361BE1F534B3ECE3C41BE835CD4BD097461BDB09383BEA24E5BBE826D35BE36B4AB3C2314AABC4D2E963D97023EBD0A3B753E426F4FBD8461313C"> } \ No newline at end of file diff --git a/compiler/test/E2E/MLPInference/2_linalg_tensor_opt.mlir b/compiler/test/E2E/MLPInference/2_linalg_tensor_opt.mlir index df028cb88..473a2a1a6 100644 --- a/compiler/test/E2E/MLPInference/2_linalg_tensor_opt.mlir +++ b/compiler/test/E2E/MLPInference/2_linalg_tensor_opt.mlir @@ -10,31 +10,24 @@ module attributes {torch.debug_module_name = "GraphModule"} { %3 = mhlo.maximum %2, %0 : tensor<2x20xf32> return %3 : tensor<2x20xf32> } - func.func private @Unknown1(%arg0: tensor<20xf32>, %arg1: tensor<2x20xf32>) -> tensor<2x20xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<2x20xf32> - %1 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<20xf32>) -> tensor<2x20xf32> - %2 = mhlo.add %arg1, %1 : tensor<2x20xf32> - %3 = mhlo.maximum %2, %0 : tensor<2x20xf32> - return %3 : tensor<2x20xf32> - } func.func private @Unknown2(%arg0: tensor<10xf32>, %arg1: tensor<2x10xf32>) -> tensor<2x10xf32> attributes {__byteir_elementwise_fusion__} { %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<10xf32>) -> tensor<2x10xf32> %1 = mhlo.add %arg1, %0 : tensor<2x10xf32> return %1 : tensor<2x10xf32> } func.func @forward(%arg0: tensor<2x10xf32>) -> tensor<2x10xf32> { - %0 = mhlo.constant dense<[0.101879634, -0.178835288, -0.0953023583, -0.0698504745, -0.19658649, -0.297641844, -0.223349303, 0.168986112, 9.007710e-02, 0.101534814, -0.0601868108, -0.0958566219, -0.243612081, 0.198881537, -0.293788224, -0.240900397, 0.184188008, 0.210917979, 0.121171109, -0.155078679]> : tensor<20xf32> - %1 = mhlo.constant dense<[0.124238588, -0.0375917405, -0.178324029, 2.1018261E-4, -0.0708629936, 0.179958493, 0.201986402, -0.0302014686, -0.0842267424, 
0.0796111747, 0.0201944318, -0.183529228, -0.133614406, -0.0192934573, 0.193412527, 0.219010666, -0.0464102961, 0.00334274326, -0.0029087835, 0.0903228372]> : tensor<20xf32> - %2 = mhlo.constant dense<[0.0670170113, 0.0825609341, -0.125343189, -0.0073415176, -0.100303039, -0.214000896, 0.114002995, 0.21737574, 0.166609675, -0.119800359]> : tensor<10xf32> - %3 = mhlo.constant dense<"0xD60ED23D6677593EEDD4253D579F523E952583BD454F62BE2C2AB53D827CF4BD7F4D41BD15896C3DDEFF3B3D5270AF3D461635BC07D5B8BDB82761BE33120CBE692C35BCDB70183EED6F4D3E271EAD3DFBC1653B77D0473E7920B43C2135C43C1780C2BBD145D43C15B1203DAAD3623C9A1808BECF144D3E7AFAC43D933CF8BDFE637BBD21BE53BEADCA233E250529BD4576FABD2B7B84BCCCB48FBC84B3D6BDAB54263EC31061BEF81913BE3E001C3E4D7A0E3EE4CE4EBE2119CFBDC6B6123EB3CA01BEB506BF3DDA0774BD37FD1E3D1EB44BBE38E409BE994A1FBE74FFDB3DBB05A93D29209A3D756B01BDEAA162BD602A03BE522F3FBE9C1DDC3D486F1EBEB99D21BEE05813BE074943BE4233E33C32BBFA3D81F31ABE747C53BEC1814A3EA77B363E551E113E65E5673D0EDAC4BDC9732E3DD45C3C3E1BCB113E9DD61DBDC754B93DC9CB88BCB5D84DBEF14E443ED54E1EBE659C343E349F62BEF5BBD73D1042FDBDBFDD4EBD306D813DE33C16BE39B00EBE946CD7BDB97CC9BD08A411BDC933103EDE725FBEF3C1553D830E3B3CBFD7253ECFBD0D3EDC6CD83DA6EA563D9238C53C1405B2BD4948D83C0A72B03DEC8AD93DD04F3A3E96216E3C0351123E034807BE0929E7BDD3E3453DCD388B3D68C034BCE7650E3E41C70D3E392C32BEEA885CBC19FFD9BD8B68F83C6FE3B5BCC302633E736E193ED8D1503EBEB64B3EE0263C3EAC4E223DB90511BE36A5443E86AA713C3698A3BCB129A93D517218BEA7BB51BECA19393E8948043E5DE08BBDA1F012BED3483E3EB3F664BEEB093DBEEF2FB7BC3243F6BC2EFA283EB41EF5BDF16A573D19E6E83DDCF4F23DA4D8C3BD4877DFBC1B13303B60330D3ECA5569BD97A090BD3A5E9C3D3565B63DE54F6BBDAC1B29BDDEB4CD3D6C5006BE61B08FBC5AD9DA3B80340EBE933AD83D6334E93CE5F18EBDCC3D73BD128D2ABD46CA34BDDDA5BABDE0C404BD533F223D3072B4BD7D7C413EB831DABD3359BBBB0A69993D6024AD3D0D0EE9BDBD5A80BC7FA6E83D400BFBBD616438BE34CBC73D4769D73D4E295EBEECBE843C838E9CBDA171FA3DDCED3EBE9FEDA6BDAD78A8BDD19702BD18B3043E39CF873C46E602BCD0CBC33D"> : tensor<20x10xf32> - %4 = mhlo.constant 
dense<"0x32A148BE93869BBDDF0E313EADAE1EBD5C52443E3506E7BBF2CFB3BD4CF5A13DBC209B3D697E213D4A14543DAB303B3EB6C1FA3B3524DBBCE1C81ABE9DDF51BEBF11173EFFDA4C3E6E8A2DBEF9F5013E6D99D1BDBEED34BDA2CFA0BCBFCE46BE85B19BBC4D7659BE398863BEC7A3DD3D590C1ABE4A942DBE9EB929BE6A0D20BE941C90BD097A76BC0640443EAE5B30BE723C573EC0EE403E0F2024BE3D24D53D3C565A3E62E5843DA45CAF3C41EC6DBBE2019E3D1D3A013E42C74CBE4E3CE63D734E60BE530B01BECD1844BEEC38BE3C19E862BEFC6F0DBEE58D4F3D03A7AA3D1DCF543C756D503E81D23BBEE0D914BE614360BCC40D4F3E5A313E3E34D6FDBD9C3519BE72500C3D1B39AABD9B2983BD110010BDAFAD27BCEAFFB5BD4A82103D8E7AD13CD2693FBE3E6125BE03FD743D3B3825BEB7FCD53DCCDA58BE2B3A4CBE1E9BDF3BFF38F73CC33C35BD40D157BEDD478A3CD1E6373E436BEEBD04BE2E3EBD5F2B3D1D7813BE0EF101BE0369A1BD8FF8D93D829D3A3E85CB2EBC6A2B023EEF8C273EB6F3C2BD27F955BDAE3B4ABC265A9D3D45B35D3E112654BB8BEEB6BDBD70993D7DDE62BEDB8017BEE2AC4FBE9F8E213E592C1B3E809B3DBE7DF30B3ED463133D672DEB3DEB5A44BEFFA2B03C894A33BE8A2935BE93A6223EE10741BEBD5F18BEDE0A0EBEEDE673BD23F447BEB37F3BBE052B503E5683343E13D6283EC693A03D751BB5BDF94F243E5014C23C003410BEEBBD613ECADD55BE56C2C3BD8922313DA41CF7BD0CF8D5BD13AA29BE8FE25B3ECC6652BE20FF4CBED78823BD32E5003E392A323DE1BAA13DFB3A9BBD9FF356BD3C4FE13DFDFA51BDB60E2C3EBEE61D3EC5EFDA3DE721A0BD13E8F3BC0C34F5BD3E911D3EFD7F38BD3C54933DC201E4BC35B79BBD09665BBE2F2FC8BDBEFA293E668CEDBD0F397ABDC1675D3EC0F1E03D164A5FBDAA43353C19AC2FBE1A2E90BC3881E6BDC3EA5EBE550D21BEB6B3023E1E97B0BD500C6DBD9577BB3D085E783C9044093D916844BE162F39BE955128BEB6F210BDA4C90BBDCD44643E511B3DBE8941593E30FEC73CBEC5D73D8FF2A2BDCB4F4B3E4F6DC2BB2249463E35888ABC80299C3D70BAFEBD40F617BEA941FFBDF13E123E4C305C3E5C838CBDF44A6EBD2A505EBEF0BFA83A8BC9E03DD0DA303D64522C3E58A96FBD7211593EC3BEA6BC14EF8ABDFE5949BE43880FBE46CE16BD097B633D7775193EB7E90F3ED2D3953DC271633D6BF8053DA6DFECBD5728953DB72E1DBD2B345DBD7053E83CE305B1BDF2F3013E0DE876BD8F1098BD7232E23D4CDE0C3DAF43A93D1A34DE3DFB1247BD569B3F3E09619BBD54502A3E6BA636BC17C3273EBE032E3E34F4173C54B10A3E9452EB3D82FDD33D005014BE578D51BDF3E116BE4EDE0F3E47BF58BEC35E793D7195303ED3A822BEA331963A9CF9403E1FD32DBD4EFCBB3DB47251BDD3A570BD3C32473E614DAD3A6978643EC49A543E02AA16BEDFDE243D7ADBE9BDE1224FBE6C6826BE5EC4083E7B00783D9E975C3EEF3A1FBDC7AD1DBDCB200EBDE55E993D734D97BC4C6E543E762EE83DBF6DFBBD5F161E3CB941F3BDF40BB2BD4C2E6ABD3B33963D7B3507BE3344553ED50454BD689A233E263939BE1D60D1BB4E3959BE0BAF203C719EC8BD48F3273EFB4B103D7943B0BDA66038BEC91500BE50263FBE52262FBE207BA93D661E823DEC4B56BE58CF3A3E913828BED66E42BE57F2B2BDFFB9503EE2FD4A3E2F23A43C542B493EE2CA34BE2966513EBB4B1FBD391127BEC942A7BD196DD9BD8AC012BD6514C1BC1F9E173EA59780BD511514BD9AB8F4BD339719BEB72639BE38C9D73CB5062F3E895743BD413055BE423D12BC402404BEB4A8463E4EA9F0BD57EA383C2432453C9611263ED58B283E6313C13DC2C95FBEA30B07BE15F2E4BD611C4BBE4FEF053CC73E013E9DF7B5BDED4160BE917CFDBD7FC9523E0A8B643EC300013E966686BDD0DC1ABE05B535BECD8A573E475E283DCECB5EBDFDE8B13D367E3EBEFCD7B43D3C7D24BECE26BD3D48BF20BEF7F0D7BB421F193E85BA77BDCAE272BAB0FD9CBD487E3FBEBBF9603D8B5E24BE27A9C33D95A2063E6DD0BC3DA359173EDA01A5BD505B97BDDEE6713D5283B43DE477DBBC674E2CBED5E3593DBCCB2FBED7D1303EA9753FBE017FC43DA94A9B3DE74A0E3E2FFC273EA596B2BDD8554A3C703E19BE04CF7BBD9155863D65FA1EBE144C573E709798BBF860CA3CC63F44BE"> : tensor<20x20xf32> - %5 = mhlo.constant 
dense<"0xEDAEFD3D6B88963E1051E33DFC7732BE055CEE3C07F9413E080B9CBE9B2F4ABE5608BD3BF8E6DFBD507F46BEC61183BEACE23ABEF010903E824129BEAFB6D83C779721BE953B2B3E8B44CB3BB09383BE2456463EA8E7983ECE3D9EBC690042BEF0D34CBD5AFCAB3DCC1AF13D8E3CF43B0BF2583EC82B583D658D653C79C131BE9AEF24BD85B4B03D46DBAF3DEB4013BD26A9693EB17CC43CEDAF77BEA24E5BBE409C203EA0BE27BCA3380BBEC03C5A3E2775633D62069C3E0DF3963D259F883E7AAD743EEB5AC8BD4B210F3ECE1F303E3D4983BDA3A3F63D3D993FBE868FF1BC89B98ABE13D72E3D5012703E826D35BE725C76BECA748B3ED59EA23D63F6103E2F3527BE354722BECE329DBE39E496BE46C71CBDC849E83D8BD2ADBD2596A73DF2DA0ABED46BA03D752989BDB8624CBE971808BEB1C184BDCC23743E36B4AB3C7D51683E7ADA4B3EDA6EDD3D2DFDF1BDC9A5D5BD948E7E3ECFCE7FBE6E9E813D85DBE43B6543143D379970BEA80F16BED17C18BD05E9193E0F403B3D26EDA43B808C9ABEA6A2823D08D786BE2314AABCD8D783BE562D9ABE2B60943E88C91ABEB2DC513E136825BEBB0DBD3D8E26E7BDF4464EBDD9E68E3E5B5115BE3FCF123E1E194FBE2CACBF3DC71E153D96F24B3E53E9E9BD407141BE714361BE4D2E963DA67B8F3EB62B78BE0D424E3D5EBB7F3D056098BEC5F011BDCB26033E1656DD3C60B90DBE7EFF583DE9C118BDDDF5673E451A58BD8E0C32BE121C05BC8D148E3ED80AEA3DBE27AA3A1F534B3E97023EBD85C231BEEB7CF33BF5FF363D32DCDBBDFBFED83DB640623D66E5EA3D785D2B3E4E7470BE651C673EDC57473E601A24BECF5FFABD38AA14BE9EFC60BCE3E5983E10EE7D3E2BE0453ECE3C41BE0A3B753E4225383D967059BDC197B8BD4FA407BE5C0C17BE0F16953E9884E7BC2E6CF6BDAF01453EB6F466BE37968ABE5BC056BECBD28E3ED9AC493DACC7103D46EB73BEBB39213DFD30F0BD835CD4BD426F4FBD620472BE67FB97BEFBAE8CBDF08F553EFAB77D3EFCC7D3BC76249D3EA2C404BE937B9B3E14197E3D30212D3E262F63BD7899D1BDF72C993E7537DE3D6FD39D3E988D4B3DDB132B3E097461BD8461313C"> : tensor<10x20xf32> - %6 = "mhlo.dot"(%arg0, %5) : (tensor<2x10xf32>, tensor<10x20xf32>) -> tensor<2x20xf32> - %7 = call @Unknown0(%0, %6) : (tensor<20xf32>, tensor<2x20xf32>) -> tensor<2x20xf32> - %8 = "mhlo.dot"(%7, %4) : (tensor<2x20xf32>, tensor<20x20xf32>) -> tensor<2x20xf32> - %9 = call @Unknown1(%1, %8) : (tensor<20xf32>, tensor<2x20xf32>) -> tensor<2x20xf32> - %10 = "mhlo.dot"(%9, %3) : (tensor<2x20xf32>, tensor<20x10xf32>) -> tensor<2x10xf32> - %11 = call @Unknown2(%2, %10) : (tensor<10xf32>, tensor<2x10xf32>) -> tensor<2x10xf32> + %0 = mhlo.constant 
dense<"0xEDAEFD3D2456463E409C203E725C76BE7D51683ED8D783BEA67B8F3E85C231BE4225383D620472BE6B88963EA8E7983EA0BE27BCCA748B3E7ADA4B3E562D9ABEB62B78BEEB7CF33B967059BD67FB97BE1051E33DCE3D9EBCA3380BBED59EA23DDA6EDD3D2B60943E0D424E3DF5FF363DC197B8BDFBAE8CBDFC7732BE690042BEC03C5A3E63F6103E2DFDF1BD88C91ABE5EBB7F3D32DCDBBD4FA407BEF08F553E055CEE3CF0D34CBD2775633D2F3527BEC9A5D5BDB2DC513E056098BEFBFED83D5C0C17BEFAB77D3E07F9413E5AFCAB3D62069C3E354722BE948E7E3E136825BEC5F011BDB640623D0F16953EFCC7D3BC080B9CBECC1AF13D0DF3963DCE329DBECFCE7FBEBB0DBD3DCB26033E66E5EA3D9884E7BC76249D3E9B2F4ABE8E3CF43B259F883E39E496BE6E9E813D8E26E7BD1656DD3C785D2B3E2E6CF6BDA2C404BE5608BD3B0BF2583E7AAD743E46C71CBD85DBE43BF4464EBD60B90DBE4E7470BEAF01453E937B9B3EF8E6DFBDC82B583DEB5AC8BDC849E83D6543143DD9E68E3E7EFF583D651C673EB6F466BE14197E3D507F46BE658D653C4B210F3E8BD2ADBD379970BE5B5115BEE9C118BDDC57473E37968ABE30212D3EC61183BE79C131BECE1F303E2596A73DA80F16BE3FCF123EDDF5673E601A24BE5BC056BE262F63BDACE23ABE9AEF24BD3D4983BDF2DA0ABED17C18BD1E194FBE451A58BDCF5FFABDCBD28E3E7899D1BDF010903E85B4B03DA3A3F63DD46BA03D05E9193E2CACBF3D8E0C32BE38AA14BED9AC493DF72C993E824129BE46DBAF3D3D993FBE752989BD0F403B3DC71E153D121C05BC9EFC60BCACC7103D7537DE3DAFB6D83CEB4013BD868FF1BCB8624CBE26EDA43B96F24B3E8D148E3EE3E5983E46EB73BE6FD39D3E779721BE26A9693E89B98ABE971808BE808C9ABE53E9E9BDD80AEA3D10EE7D3EBB39213D988D4B3D953B2B3EB17CC43C13D72E3DB1C184BDA6A2823D407141BEBE27AA3A2BE0453EFD30F0BDDB132B3E8B44CB3BEDAF77BE5012703ECC23743E08D786BE714361BE1F534B3ECE3C41BE835CD4BD097461BDB09383BEA24E5BBE826D35BE36B4AB3C2314AABC4D2E963D97023EBD0A3B753E426F4FBD8461313C"> : tensor<20x10xf32> + %1 = mhlo.constant dense<[0.101879634, -0.178835288, -0.0953023583, -0.0698504745, -0.19658649, -0.297641844, -0.223349303, 0.168986112, 9.007710e-02, 0.101534814, -0.0601868108, -0.0958566219, -0.243612081, 0.198881537, -0.293788224, -0.240900397, 0.184188008, 0.210917979, 0.121171109, -0.155078679]> : tensor<20xf32> + %2 = mhlo.constant 
dense<"0x32A148BE6D99D1BD3C565A3E614360BC1E9BDF3B265A9D3DBD5F18BE8FE25B3EC201E4BC085E783CA941FFBDD2D3953D6BA636BCD3A570BDBF6DFBBD50263FBE6514C1BCC2C95FBEFCD7B43DE477DBBC93869BBDBEED34BD62E5843DC40D4F3EFF38F73C45B35D3EDE0A0EBECC6652BE35B79BBD9044093DF13E123EC271633D17C3273E3C32473E5F161E3C52262FBE1F9E173EA30B07BE3C7D24BE674E2CBEDF0E313EA2CFA0BCA45CAF3C5A313E3EC33C35BD112654BBEDE673BD20FF4CBE09665BBE916844BE4C305C3E6BF8053DBE032E3E614DAD3AB941F3BD207BA93DA59780BD15F2E4BDCE26BD3DD5E3593DADAE1EBDBFCE46BE41EC6DBB34D6FDBD40D157BE8BEEB6BD23F447BED78823BD2F2FC8BD162F39BE5C838CBDA6DFECBD34F4173C6978643EF40BB2BD661E823D511514BD611C4BBE48BF20BEBCCB2FBE5C52443E85B19BBCE2019E3D9C3519BEDD478A3CBD70993DB37F3BBE32E5003EBEFA293E955128BEF44A6EBD5728953D54B10A3EC49A543E4C2E6ABDEC4B56BE9AB8F4BD4FEF053CF7F0D7BBD7D1303E3506E7BB4D7659BE1D3A013E72500C3DD1E6373E7DDE62BE052B503E392A323D668CEDBDB6F210BD2A505EBEB72E1DBD9452EB3D02AA16BE3B33963D58CF3A3E339719BEC73E013E421F193EA9753FBEF2CFB3BD398863BE42C74CBE1B39AABD436BEEBDDB8017BE5683343EE1BAA13D0F397ABDA4C90BBDF0BFA83A2B345DBD82FDD33DDFDE243D7B3507BE913828BEB72639BE9DF7B5BD85BA77BD017FC43D4CF5A13DC7A3DD3D4E3CE63D9B2983BD04BE2E3EE2AC4FBE13D6283EFB3A9BBDC1675D3ECD44643E8BC9E03D7053E83C005014BE7ADBE9BD3344553ED66E42BE38C9D73CED4160BECAE272BAA94A9B3DBC209B3D590C1ABE734E60BE110010BDBD5F2B3D9F8E213EC693A03D9FF356BDC0F1E03D511B3DBED0DA303DE305B1BD578D51BDE1224FBED50454BD57F2B2BDB5062F3E917CFDBDB0FD9CBDE74A0E3E697E213D4A942DBE530B01BEAFAD27BC1D7813BE592C1B3E751BB5BD3C4FE13D164A5FBD8941593E64522C3EF2F3013EF3E116BE6C6826BE689A233EFFB9503E895743BD7FC9523E487E3FBE2FFC273E4A14543D9EB929BECD1844BEEAFFB5BD0EF101BE809B3DBEF94F243EFDFA51BDAA43353C30FEC73C58A96FBD0DE876BD4EDE0F3E5EC4083E263939BEE2FD4A3E413055BE0A8B643EBBF9603DA596B2BDAB303B3E6A0D20BEEC38BE3C4A82103D0369A1BD7DF30B3E5014C23CB60E2C3E19AC2FBEBEC5D73D7211593E8F1098BD47BF58BE7B00783D1D60D1BB2F23A43C423D12BCC300013E8B5E24BED8554A3CB6C1FA3B941C90BD19E862BE8E7AD13C8FF8D93DD463133D003410BEBEE61D3E1A2E90BC8FF2A2BDC3BEA6BC7232E23DC35E793D9E975C3E4E3959BE542B493E402404BE966686BD27A9C33D703E19BE3524DBBC097A76BCFC6F0DBED2693FBE829D3A3E672DEB3DEBBD613EC5EFDA3D3881E6BDCB4F4B3E14EF8ABD4CDE0C3D7195303EEF3A1FBD0BAF203CE2CA34BEB4A8463ED0DC1ABE95A2063E04CF7BBDE1C81ABE0640443EE58D4F3D3E6125BE85CB2EBCEB5A44BECADD55BEE721A0BDC3EA5EBE4F6DC2BBFE5949BEAF43A93DD3A822BEC7AD1DBD719EC8BD2966513E4EA9F0BD05B535BE6DD0BC3D9155863D9DDF51BEAE5B30BE03A7AA3D03FD743D6A2B023EFFA2B03C56C2C3BD13E8F3BC550D21BE2249463E43880FBE1A34DE3DA331963ACB200EBD48F3273EBB4B1FBD57EA383CCD8A573EA359173E65FA1EBEBF11173E723C573E1DCF543C3B3825BEEF8C273E894A33BE8922313D0C34F5BDB6B3023E35888ABC46CE16BDFB1247BD9CF9403EE55E993DFB4B103D391127BE2432453C475E283DDA01A5BD144C573EFFDA4C3EC0EE403E756D503EB7FCD53DB6F3C2BD8A2935BEA41CF7BD3E911D3E1E97B0BD80299C3D097B633D569B3F3E1FD32DBD734D97BC7943B0BDC942A7BD9611263ECECB5EBD505B97BD709798BB6E8A2DBE0F2024BE81D23BBECCDA58BE27F955BD93A6223E0CF8D5BDFD7F38BD500C6DBD70BAFEBD7775193E09619BBD4EFCBB3D4C6E543EA66038BE196DD9BDD58B283EFDE8B13DDEE6713DF860CA3CF9F5013E3D24D53DE0D914BE2B3A4CBEAE3B4ABCE10741BE13AA29BE3C54933D9577BB3D40F617BEB7E90F3E54502A3EB47251BD762EE83DC91500BE8AC012BD6313C13D367E3EBE5283B43DC63F44BE"> : tensor<20x20xf32> + %3 = mhlo.constant dense<[0.124238588, -0.0375917405, -0.178324029, 2.1018261E-4, -0.0708629936, 0.179958493, 0.201986402, -0.0302014686, -0.0842267424, 0.0796111747, 0.0201944318, -0.183529228, -0.133614406, -0.0192934573, 0.193412527, 0.219010666, -0.0464102961, 0.00334274326, -0.0029087835, 0.0903228372]> : 
tensor<20xf32> + %4 = mhlo.constant dense<"0xD60ED23DDEFF3B3DFBC1653B7AFAC43DAB54263EDA0774BD602A03BE747C53BEC754B93D306D813DBFD7253E96216E3CEA885CBCB90511BEA1F012BEDCF4F23DAC1B29BD128D2ABD6024AD3D838E9CBD6677593E5270AF3D77D0473E933CF8BDC31061BE37FD1E3D522F3FBEC1814A3EC9CB88BCE33C16BECFBD0D3E0351123E19FFD9BD36A5443ED3483E3EA4D8C3BDDEB4CD3D46CA34BD0D0EE9BDA171FA3DEDD4253D461635BC7920B43CFE637BBDF81913BE1EB44BBE9C1DDC3DA77B363EB5D84DBE39B00EBEDC6CD83D034807BE8B68F83C86AA713CB3F664BE4877DFBC6C5006BEDDA5BABDBD5A80BCDCED3EBE579F523E07D5B8BD2135C43C21BE53BE3E001C3E38E409BE486F1EBE551E113EF14E443E946CD7BDA6EA563D0929E7BD6FE3B5BC3698A3BCEB093DBE1B13303B61B08FBCE0C404BD7FA6E83D9FEDA6BD952583BDB82761BE1780C2BBADCA233E4D7A0E3E994A1FBEB99D21BE65E5673DD54E1EBEB97CC9BD9238C53CD3E3453DC302633EB129A93DEF2FB7BC60330D3E5AD9DA3B533F223D400BFBBDAD78A8BD454F62BE33120CBED145D43C250529BDE4CE4EBE74FFDB3DE05813BE0EDAC4BD659C343E08A411BD1405B2BDCD388B3D736E193E517218BE3243F6BCCA5569BD80340EBE3072B4BD616438BED19702BD2C2AB53D692C35BC15B1203D4576FABD2119CFBDBB05A93D074943BEC9732E3D349F62BEC933103E4948D83C68C034BCD8D1503EA7BB51BE2EFA283E97A090BD933AD83D7D7C413E34CBC73D18B3043E827CF4BDDB70183EAAD3623C2B7B84BCC6B6123E29209A3D4233E33CD45C3C3EF5BBD73DDE725FBE0A72B03DE7650E3EBEB64B3ECA19393EB41EF5BD3A5E9C3D6334E93CB831DABD4769D73D39CF873C7F4D41BDED6F4D3E9A1808BECCB48FBCB3CA01BE756B01BD32BBFA3D1BCB113E1042FDBDF3C1553DEC8AD93D41C70D3EE0263C3E8948043EF16A573D3565B63DE5F18EBD3359BBBB4E295EBE46E602BC15896C3D271EAD3DCF144D3E84B3D6BDB506BF3DEAA162BD81F31ABE9DD61DBDBFDD4EBD830E3B3CD04F3A3E392C32BEAC4E223D5DE08BBD19E6E83DE54F6BBDCC3D73BD0A69993DECBE843CD0CBC33D"> : tensor<10x20xf32> + %5 = mhlo.constant dense<[0.0670170113, 0.0825609341, -0.125343189, -0.0073415176, -0.100303039, -0.214000896, 0.114002995, 0.21737574, 0.166609675, -0.119800359]> : tensor<10xf32> + %6 = "mhlo.dot_general"(%arg0, %0) {dot_dimension_numbers = #mhlo.dot} : (tensor<2x10xf32>, tensor<20x10xf32>) -> tensor<2x20xf32> + %7 = call @Unknown0(%1, %6) : (tensor<20xf32>, tensor<2x20xf32>) -> tensor<2x20xf32> + %8 = "mhlo.dot_general"(%7, %2) {dot_dimension_numbers = #mhlo.dot} : (tensor<2x20xf32>, tensor<20x20xf32>) -> tensor<2x20xf32> + %9 = call @Unknown0(%3, %8) : (tensor<20xf32>, tensor<2x20xf32>) -> tensor<2x20xf32> + %10 = "mhlo.dot_general"(%9, %4) {dot_dimension_numbers = #mhlo.dot} : (tensor<2x20xf32>, tensor<10x20xf32>) -> tensor<2x10xf32> + %11 = call @Unknown2(%5, %10) : (tensor<10xf32>, tensor<2x10xf32>) -> tensor<2x10xf32> return %11 : tensor<2x10xf32> } } \ No newline at end of file diff --git a/compiler/test/E2E/MLPInference/3_byre_tensor_opt.mlir b/compiler/test/E2E/MLPInference/3_byre_tensor_opt.mlir index 0c7e3d79e..a4a2b2bdb 100644 --- a/compiler/test/E2E/MLPInference/3_byre_tensor_opt.mlir +++ b/compiler/test/E2E/MLPInference/3_byre_tensor_opt.mlir @@ -2,53 +2,69 @@ // CHECK-LABEL: func.func @forward -#map = affine_map<(d0, d1) -> (d0, d1)> -#map1 = affine_map<(d0, d1) -> (d1)> +#map = affine_map<() -> ()> module attributes {torch.debug_module_name = "GraphModule"} { func.func private @Unknown0(%arg0: tensor<20xf32>, %arg1: tensor<2x20xf32>) -> tensor<2x20xf32> attributes {__byteir_elementwise_fusion__} { + %c20 = arith.constant 20 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f32 %0 = tensor.empty() : tensor<2x20xf32> - %1 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%arg1, %arg0 : 
tensor<2x20xf32>, tensor<20xf32>) outs(%0 : tensor<2x20xf32>) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %2 = arith.addf %in, %in_0 : f32 - %3 = arith.maxnumf %2, %cst : f32 - linalg.yield %3 : f32 - } -> tensor<2x20xf32> - return %1 : tensor<2x20xf32> - } - func.func private @Unknown1(%arg0: tensor<20xf32>, %arg1: tensor<2x20xf32>) -> tensor<2x20xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f32 - %0 = tensor.empty() : tensor<2x20xf32> - %1 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%arg1, %arg0 : tensor<2x20xf32>, tensor<20xf32>) outs(%0 : tensor<2x20xf32>) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %2 = arith.addf %in, %in_0 : f32 - %3 = arith.maxnumf %2, %cst : f32 - linalg.yield %3 : f32 - } -> tensor<2x20xf32> + %1 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %0) -> (tensor<2x20xf32>) { + %2 = scf.for %arg4 = %c0 to %c20 step %c1 iter_args(%arg5 = %arg3) -> (tensor<2x20xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg4] [1] [1] : tensor<20xf32> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg4] [1, 1] [1, 1] : tensor<2x20xf32> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%3 : tensor) { + ^bb0(%in: f32, %in_1: f32, %out: f32): + %5 = arith.addf %in_1, %in : f32 + %6 = arith.maximumf %5, %cst : f32 + linalg.yield %6 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg5[%arg2, %arg4] [1, 1] [1, 1] : tensor into tensor<2x20xf32> + scf.yield %inserted_slice : tensor<2x20xf32> + } + scf.yield %2 : tensor<2x20xf32> + } return %1 : tensor<2x20xf32> } func.func private @Unknown2(%arg0: tensor<10xf32>, %arg1: tensor<2x10xf32>) -> tensor<2x10xf32> attributes {__byteir_elementwise_fusion__} { + %c10 = arith.constant 10 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<2x10xf32> - %1 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%arg1, %arg0 : tensor<2x10xf32>, tensor<10xf32>) outs(%0 : tensor<2x10xf32>) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %2 = arith.addf %in, %in_0 : f32 - linalg.yield %2 : f32 - } -> tensor<2x10xf32> + %1 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %0) -> (tensor<2x10xf32>) { + %2 = scf.for %arg4 = %c0 to %c10 step %c1 iter_args(%arg5 = %arg3) -> (tensor<2x10xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg4] [1] [1] : tensor<10xf32> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg4] [1, 1] [1, 1] : tensor<2x10xf32> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%3 : tensor) { + ^bb0(%in: f32, %in_1: f32, %out: f32): + %5 = arith.addf %in_1, %in : f32 + linalg.yield %5 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg5[%arg2, %arg4] [1, 1] [1, 1] : tensor into tensor<2x10xf32> + scf.yield %inserted_slice : tensor<2x10xf32> + } + scf.yield %2 : tensor<2x10xf32> + } return %1 : tensor<2x10xf32> } func.func @forward(%arg0: tensor<2x10xf32>) -> tensor<2x10xf32> { - %0 = mhlo.constant dense<[0.101879634, -0.178835288, -0.0953023583, -0.0698504745, -0.19658649, -0.297641844, -0.223349303, 0.168986112, 9.007710e-02, 
0.101534814, -0.0601868108, -0.0958566219, -0.243612081, 0.198881537, -0.293788224, -0.240900397, 0.184188008, 0.210917979, 0.121171109, -0.155078679]> : tensor<20xf32> - %1 = mhlo.constant dense<[0.124238588, -0.0375917405, -0.178324029, 2.1018261E-4, -0.0708629936, 0.179958493, 0.201986402, -0.0302014686, -0.0842267424, 0.0796111747, 0.0201944318, -0.183529228, -0.133614406, -0.0192934573, 0.193412527, 0.219010666, -0.0464102961, 0.00334274326, -0.0029087835, 0.0903228372]> : tensor<20xf32> - %2 = mhlo.constant dense<[0.0670170113, 0.0825609341, -0.125343189, -0.0073415176, -0.100303039, -0.214000896, 0.114002995, 0.21737574, 0.166609675, -0.119800359]> : tensor<10xf32> - %3 = mhlo.constant dense<"0xD60ED23D6677593EEDD4253D579F523E952583BD454F62BE2C2AB53D827CF4BD7F4D41BD15896C3DDEFF3B3D5270AF3D461635BC07D5B8BDB82761BE33120CBE692C35BCDB70183EED6F4D3E271EAD3DFBC1653B77D0473E7920B43C2135C43C1780C2BBD145D43C15B1203DAAD3623C9A1808BECF144D3E7AFAC43D933CF8BDFE637BBD21BE53BEADCA233E250529BD4576FABD2B7B84BCCCB48FBC84B3D6BDAB54263EC31061BEF81913BE3E001C3E4D7A0E3EE4CE4EBE2119CFBDC6B6123EB3CA01BEB506BF3DDA0774BD37FD1E3D1EB44BBE38E409BE994A1FBE74FFDB3DBB05A93D29209A3D756B01BDEAA162BD602A03BE522F3FBE9C1DDC3D486F1EBEB99D21BEE05813BE074943BE4233E33C32BBFA3D81F31ABE747C53BEC1814A3EA77B363E551E113E65E5673D0EDAC4BDC9732E3DD45C3C3E1BCB113E9DD61DBDC754B93DC9CB88BCB5D84DBEF14E443ED54E1EBE659C343E349F62BEF5BBD73D1042FDBDBFDD4EBD306D813DE33C16BE39B00EBE946CD7BDB97CC9BD08A411BDC933103EDE725FBEF3C1553D830E3B3CBFD7253ECFBD0D3EDC6CD83DA6EA563D9238C53C1405B2BD4948D83C0A72B03DEC8AD93DD04F3A3E96216E3C0351123E034807BE0929E7BDD3E3453DCD388B3D68C034BCE7650E3E41C70D3E392C32BEEA885CBC19FFD9BD8B68F83C6FE3B5BCC302633E736E193ED8D1503EBEB64B3EE0263C3EAC4E223DB90511BE36A5443E86AA713C3698A3BCB129A93D517218BEA7BB51BECA19393E8948043E5DE08BBDA1F012BED3483E3EB3F664BEEB093DBEEF2FB7BC3243F6BC2EFA283EB41EF5BDF16A573D19E6E83DDCF4F23DA4D8C3BD4877DFBC1B13303B60330D3ECA5569BD97A090BD3A5E9C3D3565B63DE54F6BBDAC1B29BDDEB4CD3D6C5006BE61B08FBC5AD9DA3B80340EBE933AD83D6334E93CE5F18EBDCC3D73BD128D2ABD46CA34BDDDA5BABDE0C404BD533F223D3072B4BD7D7C413EB831DABD3359BBBB0A69993D6024AD3D0D0EE9BDBD5A80BC7FA6E83D400BFBBD616438BE34CBC73D4769D73D4E295EBEECBE843C838E9CBDA171FA3DDCED3EBE9FEDA6BDAD78A8BDD19702BD18B3043E39CF873C46E602BCD0CBC33D"> : tensor<20x10xf32> - %4 = mhlo.constant 
dense<"0x32A148BE93869BBDDF0E313EADAE1EBD5C52443E3506E7BBF2CFB3BD4CF5A13DBC209B3D697E213D4A14543DAB303B3EB6C1FA3B3524DBBCE1C81ABE9DDF51BEBF11173EFFDA4C3E6E8A2DBEF9F5013E6D99D1BDBEED34BDA2CFA0BCBFCE46BE85B19BBC4D7659BE398863BEC7A3DD3D590C1ABE4A942DBE9EB929BE6A0D20BE941C90BD097A76BC0640443EAE5B30BE723C573EC0EE403E0F2024BE3D24D53D3C565A3E62E5843DA45CAF3C41EC6DBBE2019E3D1D3A013E42C74CBE4E3CE63D734E60BE530B01BECD1844BEEC38BE3C19E862BEFC6F0DBEE58D4F3D03A7AA3D1DCF543C756D503E81D23BBEE0D914BE614360BCC40D4F3E5A313E3E34D6FDBD9C3519BE72500C3D1B39AABD9B2983BD110010BDAFAD27BCEAFFB5BD4A82103D8E7AD13CD2693FBE3E6125BE03FD743D3B3825BEB7FCD53DCCDA58BE2B3A4CBE1E9BDF3BFF38F73CC33C35BD40D157BEDD478A3CD1E6373E436BEEBD04BE2E3EBD5F2B3D1D7813BE0EF101BE0369A1BD8FF8D93D829D3A3E85CB2EBC6A2B023EEF8C273EB6F3C2BD27F955BDAE3B4ABC265A9D3D45B35D3E112654BB8BEEB6BDBD70993D7DDE62BEDB8017BEE2AC4FBE9F8E213E592C1B3E809B3DBE7DF30B3ED463133D672DEB3DEB5A44BEFFA2B03C894A33BE8A2935BE93A6223EE10741BEBD5F18BEDE0A0EBEEDE673BD23F447BEB37F3BBE052B503E5683343E13D6283EC693A03D751BB5BDF94F243E5014C23C003410BEEBBD613ECADD55BE56C2C3BD8922313DA41CF7BD0CF8D5BD13AA29BE8FE25B3ECC6652BE20FF4CBED78823BD32E5003E392A323DE1BAA13DFB3A9BBD9FF356BD3C4FE13DFDFA51BDB60E2C3EBEE61D3EC5EFDA3DE721A0BD13E8F3BC0C34F5BD3E911D3EFD7F38BD3C54933DC201E4BC35B79BBD09665BBE2F2FC8BDBEFA293E668CEDBD0F397ABDC1675D3EC0F1E03D164A5FBDAA43353C19AC2FBE1A2E90BC3881E6BDC3EA5EBE550D21BEB6B3023E1E97B0BD500C6DBD9577BB3D085E783C9044093D916844BE162F39BE955128BEB6F210BDA4C90BBDCD44643E511B3DBE8941593E30FEC73CBEC5D73D8FF2A2BDCB4F4B3E4F6DC2BB2249463E35888ABC80299C3D70BAFEBD40F617BEA941FFBDF13E123E4C305C3E5C838CBDF44A6EBD2A505EBEF0BFA83A8BC9E03DD0DA303D64522C3E58A96FBD7211593EC3BEA6BC14EF8ABDFE5949BE43880FBE46CE16BD097B633D7775193EB7E90F3ED2D3953DC271633D6BF8053DA6DFECBD5728953DB72E1DBD2B345DBD7053E83CE305B1BDF2F3013E0DE876BD8F1098BD7232E23D4CDE0C3DAF43A93D1A34DE3DFB1247BD569B3F3E09619BBD54502A3E6BA636BC17C3273EBE032E3E34F4173C54B10A3E9452EB3D82FDD33D005014BE578D51BDF3E116BE4EDE0F3E47BF58BEC35E793D7195303ED3A822BEA331963A9CF9403E1FD32DBD4EFCBB3DB47251BDD3A570BD3C32473E614DAD3A6978643EC49A543E02AA16BEDFDE243D7ADBE9BDE1224FBE6C6826BE5EC4083E7B00783D9E975C3EEF3A1FBDC7AD1DBDCB200EBDE55E993D734D97BC4C6E543E762EE83DBF6DFBBD5F161E3CB941F3BDF40BB2BD4C2E6ABD3B33963D7B3507BE3344553ED50454BD689A233E263939BE1D60D1BB4E3959BE0BAF203C719EC8BD48F3273EFB4B103D7943B0BDA66038BEC91500BE50263FBE52262FBE207BA93D661E823DEC4B56BE58CF3A3E913828BED66E42BE57F2B2BDFFB9503EE2FD4A3E2F23A43C542B493EE2CA34BE2966513EBB4B1FBD391127BEC942A7BD196DD9BD8AC012BD6514C1BC1F9E173EA59780BD511514BD9AB8F4BD339719BEB72639BE38C9D73CB5062F3E895743BD413055BE423D12BC402404BEB4A8463E4EA9F0BD57EA383C2432453C9611263ED58B283E6313C13DC2C95FBEA30B07BE15F2E4BD611C4BBE4FEF053CC73E013E9DF7B5BDED4160BE917CFDBD7FC9523E0A8B643EC300013E966686BDD0DC1ABE05B535BECD8A573E475E283DCECB5EBDFDE8B13D367E3EBEFCD7B43D3C7D24BECE26BD3D48BF20BEF7F0D7BB421F193E85BA77BDCAE272BAB0FD9CBD487E3FBEBBF9603D8B5E24BE27A9C33D95A2063E6DD0BC3DA359173EDA01A5BD505B97BDDEE6713D5283B43DE477DBBC674E2CBED5E3593DBCCB2FBED7D1303EA9753FBE017FC43DA94A9B3DE74A0E3E2FFC273EA596B2BDD8554A3C703E19BE04CF7BBD9155863D65FA1EBE144C573E709798BBF860CA3CC63F44BE"> : tensor<20x20xf32> - %5 = mhlo.constant 
dense<"0xEDAEFD3D6B88963E1051E33DFC7732BE055CEE3C07F9413E080B9CBE9B2F4ABE5608BD3BF8E6DFBD507F46BEC61183BEACE23ABEF010903E824129BEAFB6D83C779721BE953B2B3E8B44CB3BB09383BE2456463EA8E7983ECE3D9EBC690042BEF0D34CBD5AFCAB3DCC1AF13D8E3CF43B0BF2583EC82B583D658D653C79C131BE9AEF24BD85B4B03D46DBAF3DEB4013BD26A9693EB17CC43CEDAF77BEA24E5BBE409C203EA0BE27BCA3380BBEC03C5A3E2775633D62069C3E0DF3963D259F883E7AAD743EEB5AC8BD4B210F3ECE1F303E3D4983BDA3A3F63D3D993FBE868FF1BC89B98ABE13D72E3D5012703E826D35BE725C76BECA748B3ED59EA23D63F6103E2F3527BE354722BECE329DBE39E496BE46C71CBDC849E83D8BD2ADBD2596A73DF2DA0ABED46BA03D752989BDB8624CBE971808BEB1C184BDCC23743E36B4AB3C7D51683E7ADA4B3EDA6EDD3D2DFDF1BDC9A5D5BD948E7E3ECFCE7FBE6E9E813D85DBE43B6543143D379970BEA80F16BED17C18BD05E9193E0F403B3D26EDA43B808C9ABEA6A2823D08D786BE2314AABCD8D783BE562D9ABE2B60943E88C91ABEB2DC513E136825BEBB0DBD3D8E26E7BDF4464EBDD9E68E3E5B5115BE3FCF123E1E194FBE2CACBF3DC71E153D96F24B3E53E9E9BD407141BE714361BE4D2E963DA67B8F3EB62B78BE0D424E3D5EBB7F3D056098BEC5F011BDCB26033E1656DD3C60B90DBE7EFF583DE9C118BDDDF5673E451A58BD8E0C32BE121C05BC8D148E3ED80AEA3DBE27AA3A1F534B3E97023EBD85C231BEEB7CF33BF5FF363D32DCDBBDFBFED83DB640623D66E5EA3D785D2B3E4E7470BE651C673EDC57473E601A24BECF5FFABD38AA14BE9EFC60BCE3E5983E10EE7D3E2BE0453ECE3C41BE0A3B753E4225383D967059BDC197B8BD4FA407BE5C0C17BE0F16953E9884E7BC2E6CF6BDAF01453EB6F466BE37968ABE5BC056BECBD28E3ED9AC493DACC7103D46EB73BEBB39213DFD30F0BD835CD4BD426F4FBD620472BE67FB97BEFBAE8CBDF08F553EFAB77D3EFCC7D3BC76249D3EA2C404BE937B9B3E14197E3D30212D3E262F63BD7899D1BDF72C993E7537DE3D6FD39D3E988D4B3DDB132B3E097461BD8461313C"> : tensor<10x20xf32> - %6 = "mhlo.dot"(%arg0, %5) : (tensor<2x10xf32>, tensor<10x20xf32>) -> tensor<2x20xf32> - %7 = call @Unknown0(%0, %6) : (tensor<20xf32>, tensor<2x20xf32>) -> tensor<2x20xf32> - %8 = "mhlo.dot"(%7, %4) : (tensor<2x20xf32>, tensor<20x20xf32>) -> tensor<2x20xf32> - %9 = call @Unknown1(%1, %8) : (tensor<20xf32>, tensor<2x20xf32>) -> tensor<2x20xf32> - %10 = "mhlo.dot"(%9, %3) : (tensor<2x20xf32>, tensor<20x10xf32>) -> tensor<2x10xf32> - %11 = call @Unknown2(%2, %10) : (tensor<10xf32>, tensor<2x10xf32>) -> tensor<2x10xf32> + %0 = mhlo.constant 
dense<"0xEDAEFD3D2456463E409C203E725C76BE7D51683ED8D783BEA67B8F3E85C231BE4225383D620472BE6B88963EA8E7983EA0BE27BCCA748B3E7ADA4B3E562D9ABEB62B78BEEB7CF33B967059BD67FB97BE1051E33DCE3D9EBCA3380BBED59EA23DDA6EDD3D2B60943E0D424E3DF5FF363DC197B8BDFBAE8CBDFC7732BE690042BEC03C5A3E63F6103E2DFDF1BD88C91ABE5EBB7F3D32DCDBBD4FA407BEF08F553E055CEE3CF0D34CBD2775633D2F3527BEC9A5D5BDB2DC513E056098BEFBFED83D5C0C17BEFAB77D3E07F9413E5AFCAB3D62069C3E354722BE948E7E3E136825BEC5F011BDB640623D0F16953EFCC7D3BC080B9CBECC1AF13D0DF3963DCE329DBECFCE7FBEBB0DBD3DCB26033E66E5EA3D9884E7BC76249D3E9B2F4ABE8E3CF43B259F883E39E496BE6E9E813D8E26E7BD1656DD3C785D2B3E2E6CF6BDA2C404BE5608BD3B0BF2583E7AAD743E46C71CBD85DBE43BF4464EBD60B90DBE4E7470BEAF01453E937B9B3EF8E6DFBDC82B583DEB5AC8BDC849E83D6543143DD9E68E3E7EFF583D651C673EB6F466BE14197E3D507F46BE658D653C4B210F3E8BD2ADBD379970BE5B5115BEE9C118BDDC57473E37968ABE30212D3EC61183BE79C131BECE1F303E2596A73DA80F16BE3FCF123EDDF5673E601A24BE5BC056BE262F63BDACE23ABE9AEF24BD3D4983BDF2DA0ABED17C18BD1E194FBE451A58BDCF5FFABDCBD28E3E7899D1BDF010903E85B4B03DA3A3F63DD46BA03D05E9193E2CACBF3D8E0C32BE38AA14BED9AC493DF72C993E824129BE46DBAF3D3D993FBE752989BD0F403B3DC71E153D121C05BC9EFC60BCACC7103D7537DE3DAFB6D83CEB4013BD868FF1BCB8624CBE26EDA43B96F24B3E8D148E3EE3E5983E46EB73BE6FD39D3E779721BE26A9693E89B98ABE971808BE808C9ABE53E9E9BDD80AEA3D10EE7D3EBB39213D988D4B3D953B2B3EB17CC43C13D72E3DB1C184BDA6A2823D407141BEBE27AA3A2BE0453EFD30F0BDDB132B3E8B44CB3BEDAF77BE5012703ECC23743E08D786BE714361BE1F534B3ECE3C41BE835CD4BD097461BDB09383BEA24E5BBE826D35BE36B4AB3C2314AABC4D2E963D97023EBD0A3B753E426F4FBD8461313C"> : tensor<20x10xf32> + %1 = mhlo.constant dense<[0.101879634, -0.178835288, -0.0953023583, -0.0698504745, -0.19658649, -0.297641844, -0.223349303, 0.168986112, 9.007710e-02, 0.101534814, -0.0601868108, -0.0958566219, -0.243612081, 0.198881537, -0.293788224, -0.240900397, 0.184188008, 0.210917979, 0.121171109, -0.155078679]> : tensor<20xf32> + %2 = mhlo.constant 
dense<"0x32A148BE6D99D1BD3C565A3E614360BC1E9BDF3B265A9D3DBD5F18BE8FE25B3EC201E4BC085E783CA941FFBDD2D3953D6BA636BCD3A570BDBF6DFBBD50263FBE6514C1BCC2C95FBEFCD7B43DE477DBBC93869BBDBEED34BD62E5843DC40D4F3EFF38F73C45B35D3EDE0A0EBECC6652BE35B79BBD9044093DF13E123EC271633D17C3273E3C32473E5F161E3C52262FBE1F9E173EA30B07BE3C7D24BE674E2CBEDF0E313EA2CFA0BCA45CAF3C5A313E3EC33C35BD112654BBEDE673BD20FF4CBE09665BBE916844BE4C305C3E6BF8053DBE032E3E614DAD3AB941F3BD207BA93DA59780BD15F2E4BDCE26BD3DD5E3593DADAE1EBDBFCE46BE41EC6DBB34D6FDBD40D157BE8BEEB6BD23F447BED78823BD2F2FC8BD162F39BE5C838CBDA6DFECBD34F4173C6978643EF40BB2BD661E823D511514BD611C4BBE48BF20BEBCCB2FBE5C52443E85B19BBCE2019E3D9C3519BEDD478A3CBD70993DB37F3BBE32E5003EBEFA293E955128BEF44A6EBD5728953D54B10A3EC49A543E4C2E6ABDEC4B56BE9AB8F4BD4FEF053CF7F0D7BBD7D1303E3506E7BB4D7659BE1D3A013E72500C3DD1E6373E7DDE62BE052B503E392A323D668CEDBDB6F210BD2A505EBEB72E1DBD9452EB3D02AA16BE3B33963D58CF3A3E339719BEC73E013E421F193EA9753FBEF2CFB3BD398863BE42C74CBE1B39AABD436BEEBDDB8017BE5683343EE1BAA13D0F397ABDA4C90BBDF0BFA83A2B345DBD82FDD33DDFDE243D7B3507BE913828BEB72639BE9DF7B5BD85BA77BD017FC43D4CF5A13DC7A3DD3D4E3CE63D9B2983BD04BE2E3EE2AC4FBE13D6283EFB3A9BBDC1675D3ECD44643E8BC9E03D7053E83C005014BE7ADBE9BD3344553ED66E42BE38C9D73CED4160BECAE272BAA94A9B3DBC209B3D590C1ABE734E60BE110010BDBD5F2B3D9F8E213EC693A03D9FF356BDC0F1E03D511B3DBED0DA303DE305B1BD578D51BDE1224FBED50454BD57F2B2BDB5062F3E917CFDBDB0FD9CBDE74A0E3E697E213D4A942DBE530B01BEAFAD27BC1D7813BE592C1B3E751BB5BD3C4FE13D164A5FBD8941593E64522C3EF2F3013EF3E116BE6C6826BE689A233EFFB9503E895743BD7FC9523E487E3FBE2FFC273E4A14543D9EB929BECD1844BEEAFFB5BD0EF101BE809B3DBEF94F243EFDFA51BDAA43353C30FEC73C58A96FBD0DE876BD4EDE0F3E5EC4083E263939BEE2FD4A3E413055BE0A8B643EBBF9603DA596B2BDAB303B3E6A0D20BEEC38BE3C4A82103D0369A1BD7DF30B3E5014C23CB60E2C3E19AC2FBEBEC5D73D7211593E8F1098BD47BF58BE7B00783D1D60D1BB2F23A43C423D12BCC300013E8B5E24BED8554A3CB6C1FA3B941C90BD19E862BE8E7AD13C8FF8D93DD463133D003410BEBEE61D3E1A2E90BC8FF2A2BDC3BEA6BC7232E23DC35E793D9E975C3E4E3959BE542B493E402404BE966686BD27A9C33D703E19BE3524DBBC097A76BCFC6F0DBED2693FBE829D3A3E672DEB3DEBBD613EC5EFDA3D3881E6BDCB4F4B3E14EF8ABD4CDE0C3D7195303EEF3A1FBD0BAF203CE2CA34BEB4A8463ED0DC1ABE95A2063E04CF7BBDE1C81ABE0640443EE58D4F3D3E6125BE85CB2EBCEB5A44BECADD55BEE721A0BDC3EA5EBE4F6DC2BBFE5949BEAF43A93DD3A822BEC7AD1DBD719EC8BD2966513E4EA9F0BD05B535BE6DD0BC3D9155863D9DDF51BEAE5B30BE03A7AA3D03FD743D6A2B023EFFA2B03C56C2C3BD13E8F3BC550D21BE2249463E43880FBE1A34DE3DA331963ACB200EBD48F3273EBB4B1FBD57EA383CCD8A573EA359173E65FA1EBEBF11173E723C573E1DCF543C3B3825BEEF8C273E894A33BE8922313D0C34F5BDB6B3023E35888ABC46CE16BDFB1247BD9CF9403EE55E993DFB4B103D391127BE2432453C475E283DDA01A5BD144C573EFFDA4C3EC0EE403E756D503EB7FCD53DB6F3C2BD8A2935BEA41CF7BD3E911D3E1E97B0BD80299C3D097B633D569B3F3E1FD32DBD734D97BC7943B0BDC942A7BD9611263ECECB5EBD505B97BD709798BB6E8A2DBE0F2024BE81D23BBECCDA58BE27F955BD93A6223E0CF8D5BDFD7F38BD500C6DBD70BAFEBD7775193E09619BBD4EFCBB3D4C6E543EA66038BE196DD9BDD58B283EFDE8B13DDEE6713DF860CA3CF9F5013E3D24D53DE0D914BE2B3A4CBEAE3B4ABCE10741BE13AA29BE3C54933D9577BB3D40F617BEB7E90F3E54502A3EB47251BD762EE83DC91500BE8AC012BD6313C13D367E3EBE5283B43DC63F44BE"> : tensor<20x20xf32> + %3 = mhlo.constant dense<[0.124238588, -0.0375917405, -0.178324029, 2.1018261E-4, -0.0708629936, 0.179958493, 0.201986402, -0.0302014686, -0.0842267424, 0.0796111747, 0.0201944318, -0.183529228, -0.133614406, -0.0192934573, 0.193412527, 0.219010666, -0.0464102961, 0.00334274326, -0.0029087835, 0.0903228372]> : 
tensor<20xf32> + %4 = mhlo.constant dense<"0xD60ED23DDEFF3B3DFBC1653B7AFAC43DAB54263EDA0774BD602A03BE747C53BEC754B93D306D813DBFD7253E96216E3CEA885CBCB90511BEA1F012BEDCF4F23DAC1B29BD128D2ABD6024AD3D838E9CBD6677593E5270AF3D77D0473E933CF8BDC31061BE37FD1E3D522F3FBEC1814A3EC9CB88BCE33C16BECFBD0D3E0351123E19FFD9BD36A5443ED3483E3EA4D8C3BDDEB4CD3D46CA34BD0D0EE9BDA171FA3DEDD4253D461635BC7920B43CFE637BBDF81913BE1EB44BBE9C1DDC3DA77B363EB5D84DBE39B00EBEDC6CD83D034807BE8B68F83C86AA713CB3F664BE4877DFBC6C5006BEDDA5BABDBD5A80BCDCED3EBE579F523E07D5B8BD2135C43C21BE53BE3E001C3E38E409BE486F1EBE551E113EF14E443E946CD7BDA6EA563D0929E7BD6FE3B5BC3698A3BCEB093DBE1B13303B61B08FBCE0C404BD7FA6E83D9FEDA6BD952583BDB82761BE1780C2BBADCA233E4D7A0E3E994A1FBEB99D21BE65E5673DD54E1EBEB97CC9BD9238C53CD3E3453DC302633EB129A93DEF2FB7BC60330D3E5AD9DA3B533F223D400BFBBDAD78A8BD454F62BE33120CBED145D43C250529BDE4CE4EBE74FFDB3DE05813BE0EDAC4BD659C343E08A411BD1405B2BDCD388B3D736E193E517218BE3243F6BCCA5569BD80340EBE3072B4BD616438BED19702BD2C2AB53D692C35BC15B1203D4576FABD2119CFBDBB05A93D074943BEC9732E3D349F62BEC933103E4948D83C68C034BCD8D1503EA7BB51BE2EFA283E97A090BD933AD83D7D7C413E34CBC73D18B3043E827CF4BDDB70183EAAD3623C2B7B84BCC6B6123E29209A3D4233E33CD45C3C3EF5BBD73DDE725FBE0A72B03DE7650E3EBEB64B3ECA19393EB41EF5BD3A5E9C3D6334E93CB831DABD4769D73D39CF873C7F4D41BDED6F4D3E9A1808BECCB48FBCB3CA01BE756B01BD32BBFA3D1BCB113E1042FDBDF3C1553DEC8AD93D41C70D3EE0263C3E8948043EF16A573D3565B63DE5F18EBD3359BBBB4E295EBE46E602BC15896C3D271EAD3DCF144D3E84B3D6BDB506BF3DEAA162BD81F31ABE9DD61DBDBFDD4EBD830E3B3CD04F3A3E392C32BEAC4E223D5DE08BBD19E6E83DE54F6BBDCC3D73BD0A69993DECBE843CD0CBC33D"> : tensor<10x20xf32> + %5 = mhlo.constant dense<[0.0670170113, 0.0825609341, -0.125343189, -0.0073415176, -0.100303039, -0.214000896, 0.114002995, 0.21737574, 0.166609675, -0.119800359]> : tensor<10xf32> + %6 = "mhlo.dot_general"(%arg0, %0) {dot_dimension_numbers = #mhlo.dot} : (tensor<2x10xf32>, tensor<20x10xf32>) -> tensor<2x20xf32> + %7 = call @Unknown0(%1, %6) : (tensor<20xf32>, tensor<2x20xf32>) -> tensor<2x20xf32> + %8 = "mhlo.dot_general"(%7, %2) {dot_dimension_numbers = #mhlo.dot} : (tensor<2x20xf32>, tensor<20x20xf32>) -> tensor<2x20xf32> + %9 = call @Unknown0(%3, %8) : (tensor<20xf32>, tensor<2x20xf32>) -> tensor<2x20xf32> + %10 = "mhlo.dot_general"(%9, %4) {dot_dimension_numbers = #mhlo.dot} : (tensor<2x20xf32>, tensor<10x20xf32>) -> tensor<2x10xf32> + %11 = call @Unknown2(%5, %10) : (tensor<10xf32>, tensor<2x10xf32>) -> tensor<2x10xf32> return %11 : tensor<2x10xf32> } } \ No newline at end of file diff --git a/compiler/test/E2E/MLPInference/4_bufferize_opt.mlir b/compiler/test/E2E/MLPInference/4_bufferize_opt.mlir index e204402b4..d9816f589 100644 --- a/compiler/test/E2E/MLPInference/4_bufferize_opt.mlir +++ b/compiler/test/E2E/MLPInference/4_bufferize_opt.mlir @@ -2,56 +2,72 @@ // CHECK-LABEL: func.func @forward -#map = affine_map<(d0, d1) -> (d0, d1)> -#map1 = affine_map<(d0, d1) -> (d1)> +#map = affine_map<() -> ()> module attributes {torch.debug_module_name = "GraphModule"} { func.func private @Unknown0(%arg0: tensor<20xf32>, %arg1: tensor<2x20xf32>) -> tensor<2x20xf32> attributes {__byteir_elementwise_fusion__} { + %c20 = arith.constant 20 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f32 %0 = tensor.empty() : tensor<2x20xf32> - %1 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%arg1, %arg0 : 
tensor<2x20xf32>, tensor<20xf32>) outs(%0 : tensor<2x20xf32>) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %2 = arith.addf %in, %in_0 : f32 - %3 = arith.maxnumf %2, %cst : f32 - linalg.yield %3 : f32 - } -> tensor<2x20xf32> - return %1 : tensor<2x20xf32> - } - func.func private @Unknown1(%arg0: tensor<20xf32>, %arg1: tensor<2x20xf32>) -> tensor<2x20xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f32 - %0 = tensor.empty() : tensor<2x20xf32> - %1 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%arg1, %arg0 : tensor<2x20xf32>, tensor<20xf32>) outs(%0 : tensor<2x20xf32>) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %2 = arith.addf %in, %in_0 : f32 - %3 = arith.maxnumf %2, %cst : f32 - linalg.yield %3 : f32 - } -> tensor<2x20xf32> + %1 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %0) -> (tensor<2x20xf32>) { + %2 = scf.for %arg4 = %c0 to %c20 step %c1 iter_args(%arg5 = %arg3) -> (tensor<2x20xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg4] [1] [1] : tensor<20xf32> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg4] [1, 1] [1, 1] : tensor<2x20xf32> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%3 : tensor) { + ^bb0(%in: f32, %in_1: f32, %out: f32): + %5 = arith.addf %in_1, %in : f32 + %6 = arith.maximumf %5, %cst : f32 + linalg.yield %6 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg5[%arg2, %arg4] [1, 1] [1, 1] : tensor into tensor<2x20xf32> + scf.yield %inserted_slice : tensor<2x20xf32> + } + scf.yield %2 : tensor<2x20xf32> + } return %1 : tensor<2x20xf32> } func.func private @Unknown2(%arg0: tensor<10xf32>, %arg1: tensor<2x10xf32>) -> tensor<2x10xf32> attributes {__byteir_elementwise_fusion__} { + %c10 = arith.constant 10 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<2x10xf32> - %1 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%arg1, %arg0 : tensor<2x10xf32>, tensor<10xf32>) outs(%0 : tensor<2x10xf32>) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %2 = arith.addf %in, %in_0 : f32 - linalg.yield %2 : f32 - } -> tensor<2x10xf32> + %1 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %0) -> (tensor<2x10xf32>) { + %2 = scf.for %arg4 = %c0 to %c10 step %c1 iter_args(%arg5 = %arg3) -> (tensor<2x10xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg4] [1] [1] : tensor<10xf32> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg4] [1, 1] [1, 1] : tensor<2x10xf32> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%3 : tensor) { + ^bb0(%in: f32, %in_1: f32, %out: f32): + %5 = arith.addf %in_1, %in : f32 + linalg.yield %5 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg5[%arg2, %arg4] [1, 1] [1, 1] : tensor into tensor<2x10xf32> + scf.yield %inserted_slice : tensor<2x10xf32> + } + scf.yield %2 : tensor<2x10xf32> + } return %1 : tensor<2x10xf32> } func.func @forward(%arg0: tensor<2x10xf32>) -> tensor<2x10xf32> attributes {__placeholder__byre.entry_point} { - %cst = arith.constant dense<[0.101879634, -0.178835288, -0.0953023583, -0.0698504745, -0.19658649, -0.297641844, 
-0.223349303, 0.168986112, 9.007710e-02, 0.101534814, -0.0601868108, -0.0958566219, -0.243612081, 0.198881537, -0.293788224, -0.240900397, 0.184188008, 0.210917979, 0.121171109, -0.155078679]> : tensor<20xf32> - %cst_0 = arith.constant dense<[0.124238588, -0.0375917405, -0.178324029, 2.1018261E-4, -0.0708629936, 0.179958493, 0.201986402, -0.0302014686, -0.0842267424, 0.0796111747, 0.0201944318, -0.183529228, -0.133614406, -0.0192934573, 0.193412527, 0.219010666, -0.0464102961, 0.00334274326, -0.0029087835, 0.0903228372]> : tensor<20xf32> - %cst_1 = arith.constant dense<[0.0670170113, 0.0825609341, -0.125343189, -0.0073415176, -0.100303039, -0.214000896, 0.114002995, 0.21737574, 0.166609675, -0.119800359]> : tensor<10xf32> - %cst_2 = arith.constant dense<"0xD60ED23D6677593EEDD4253D579F523E952583BD454F62BE2C2AB53D827CF4BD7F4D41BD15896C3DDEFF3B3D5270AF3D461635BC07D5B8BDB82761BE33120CBE692C35BCDB70183EED6F4D3E271EAD3DFBC1653B77D0473E7920B43C2135C43C1780C2BBD145D43C15B1203DAAD3623C9A1808BECF144D3E7AFAC43D933CF8BDFE637BBD21BE53BEADCA233E250529BD4576FABD2B7B84BCCCB48FBC84B3D6BDAB54263EC31061BEF81913BE3E001C3E4D7A0E3EE4CE4EBE2119CFBDC6B6123EB3CA01BEB506BF3DDA0774BD37FD1E3D1EB44BBE38E409BE994A1FBE74FFDB3DBB05A93D29209A3D756B01BDEAA162BD602A03BE522F3FBE9C1DDC3D486F1EBEB99D21BEE05813BE074943BE4233E33C32BBFA3D81F31ABE747C53BEC1814A3EA77B363E551E113E65E5673D0EDAC4BDC9732E3DD45C3C3E1BCB113E9DD61DBDC754B93DC9CB88BCB5D84DBEF14E443ED54E1EBE659C343E349F62BEF5BBD73D1042FDBDBFDD4EBD306D813DE33C16BE39B00EBE946CD7BDB97CC9BD08A411BDC933103EDE725FBEF3C1553D830E3B3CBFD7253ECFBD0D3EDC6CD83DA6EA563D9238C53C1405B2BD4948D83C0A72B03DEC8AD93DD04F3A3E96216E3C0351123E034807BE0929E7BDD3E3453DCD388B3D68C034BCE7650E3E41C70D3E392C32BEEA885CBC19FFD9BD8B68F83C6FE3B5BCC302633E736E193ED8D1503EBEB64B3EE0263C3EAC4E223DB90511BE36A5443E86AA713C3698A3BCB129A93D517218BEA7BB51BECA19393E8948043E5DE08BBDA1F012BED3483E3EB3F664BEEB093DBEEF2FB7BC3243F6BC2EFA283EB41EF5BDF16A573D19E6E83DDCF4F23DA4D8C3BD4877DFBC1B13303B60330D3ECA5569BD97A090BD3A5E9C3D3565B63DE54F6BBDAC1B29BDDEB4CD3D6C5006BE61B08FBC5AD9DA3B80340EBE933AD83D6334E93CE5F18EBDCC3D73BD128D2ABD46CA34BDDDA5BABDE0C404BD533F223D3072B4BD7D7C413EB831DABD3359BBBB0A69993D6024AD3D0D0EE9BDBD5A80BC7FA6E83D400BFBBD616438BE34CBC73D4769D73D4E295EBEECBE843C838E9CBDA171FA3DDCED3EBE9FEDA6BDAD78A8BDD19702BD18B3043E39CF873C46E602BCD0CBC33D"> : tensor<20x10xf32> - %cst_3 = arith.constant 
dense<"0x32A148BE93869BBDDF0E313EADAE1EBD5C52443E3506E7BBF2CFB3BD4CF5A13DBC209B3D697E213D4A14543DAB303B3EB6C1FA3B3524DBBCE1C81ABE9DDF51BEBF11173EFFDA4C3E6E8A2DBEF9F5013E6D99D1BDBEED34BDA2CFA0BCBFCE46BE85B19BBC4D7659BE398863BEC7A3DD3D590C1ABE4A942DBE9EB929BE6A0D20BE941C90BD097A76BC0640443EAE5B30BE723C573EC0EE403E0F2024BE3D24D53D3C565A3E62E5843DA45CAF3C41EC6DBBE2019E3D1D3A013E42C74CBE4E3CE63D734E60BE530B01BECD1844BEEC38BE3C19E862BEFC6F0DBEE58D4F3D03A7AA3D1DCF543C756D503E81D23BBEE0D914BE614360BCC40D4F3E5A313E3E34D6FDBD9C3519BE72500C3D1B39AABD9B2983BD110010BDAFAD27BCEAFFB5BD4A82103D8E7AD13CD2693FBE3E6125BE03FD743D3B3825BEB7FCD53DCCDA58BE2B3A4CBE1E9BDF3BFF38F73CC33C35BD40D157BEDD478A3CD1E6373E436BEEBD04BE2E3EBD5F2B3D1D7813BE0EF101BE0369A1BD8FF8D93D829D3A3E85CB2EBC6A2B023EEF8C273EB6F3C2BD27F955BDAE3B4ABC265A9D3D45B35D3E112654BB8BEEB6BDBD70993D7DDE62BEDB8017BEE2AC4FBE9F8E213E592C1B3E809B3DBE7DF30B3ED463133D672DEB3DEB5A44BEFFA2B03C894A33BE8A2935BE93A6223EE10741BEBD5F18BEDE0A0EBEEDE673BD23F447BEB37F3BBE052B503E5683343E13D6283EC693A03D751BB5BDF94F243E5014C23C003410BEEBBD613ECADD55BE56C2C3BD8922313DA41CF7BD0CF8D5BD13AA29BE8FE25B3ECC6652BE20FF4CBED78823BD32E5003E392A323DE1BAA13DFB3A9BBD9FF356BD3C4FE13DFDFA51BDB60E2C3EBEE61D3EC5EFDA3DE721A0BD13E8F3BC0C34F5BD3E911D3EFD7F38BD3C54933DC201E4BC35B79BBD09665BBE2F2FC8BDBEFA293E668CEDBD0F397ABDC1675D3EC0F1E03D164A5FBDAA43353C19AC2FBE1A2E90BC3881E6BDC3EA5EBE550D21BEB6B3023E1E97B0BD500C6DBD9577BB3D085E783C9044093D916844BE162F39BE955128BEB6F210BDA4C90BBDCD44643E511B3DBE8941593E30FEC73CBEC5D73D8FF2A2BDCB4F4B3E4F6DC2BB2249463E35888ABC80299C3D70BAFEBD40F617BEA941FFBDF13E123E4C305C3E5C838CBDF44A6EBD2A505EBEF0BFA83A8BC9E03DD0DA303D64522C3E58A96FBD7211593EC3BEA6BC14EF8ABDFE5949BE43880FBE46CE16BD097B633D7775193EB7E90F3ED2D3953DC271633D6BF8053DA6DFECBD5728953DB72E1DBD2B345DBD7053E83CE305B1BDF2F3013E0DE876BD8F1098BD7232E23D4CDE0C3DAF43A93D1A34DE3DFB1247BD569B3F3E09619BBD54502A3E6BA636BC17C3273EBE032E3E34F4173C54B10A3E9452EB3D82FDD33D005014BE578D51BDF3E116BE4EDE0F3E47BF58BEC35E793D7195303ED3A822BEA331963A9CF9403E1FD32DBD4EFCBB3DB47251BDD3A570BD3C32473E614DAD3A6978643EC49A543E02AA16BEDFDE243D7ADBE9BDE1224FBE6C6826BE5EC4083E7B00783D9E975C3EEF3A1FBDC7AD1DBDCB200EBDE55E993D734D97BC4C6E543E762EE83DBF6DFBBD5F161E3CB941F3BDF40BB2BD4C2E6ABD3B33963D7B3507BE3344553ED50454BD689A233E263939BE1D60D1BB4E3959BE0BAF203C719EC8BD48F3273EFB4B103D7943B0BDA66038BEC91500BE50263FBE52262FBE207BA93D661E823DEC4B56BE58CF3A3E913828BED66E42BE57F2B2BDFFB9503EE2FD4A3E2F23A43C542B493EE2CA34BE2966513EBB4B1FBD391127BEC942A7BD196DD9BD8AC012BD6514C1BC1F9E173EA59780BD511514BD9AB8F4BD339719BEB72639BE38C9D73CB5062F3E895743BD413055BE423D12BC402404BEB4A8463E4EA9F0BD57EA383C2432453C9611263ED58B283E6313C13DC2C95FBEA30B07BE15F2E4BD611C4BBE4FEF053CC73E013E9DF7B5BDED4160BE917CFDBD7FC9523E0A8B643EC300013E966686BDD0DC1ABE05B535BECD8A573E475E283DCECB5EBDFDE8B13D367E3EBEFCD7B43D3C7D24BECE26BD3D48BF20BEF7F0D7BB421F193E85BA77BDCAE272BAB0FD9CBD487E3FBEBBF9603D8B5E24BE27A9C33D95A2063E6DD0BC3DA359173EDA01A5BD505B97BDDEE6713D5283B43DE477DBBC674E2CBED5E3593DBCCB2FBED7D1303EA9753FBE017FC43DA94A9B3DE74A0E3E2FFC273EA596B2BDD8554A3C703E19BE04CF7BBD9155863D65FA1EBE144C573E709798BBF860CA3CC63F44BE"> : tensor<20x20xf32> - %cst_4 = arith.constant 
dense<"0xEDAEFD3D6B88963E1051E33DFC7732BE055CEE3C07F9413E080B9CBE9B2F4ABE5608BD3BF8E6DFBD507F46BEC61183BEACE23ABEF010903E824129BEAFB6D83C779721BE953B2B3E8B44CB3BB09383BE2456463EA8E7983ECE3D9EBC690042BEF0D34CBD5AFCAB3DCC1AF13D8E3CF43B0BF2583EC82B583D658D653C79C131BE9AEF24BD85B4B03D46DBAF3DEB4013BD26A9693EB17CC43CEDAF77BEA24E5BBE409C203EA0BE27BCA3380BBEC03C5A3E2775633D62069C3E0DF3963D259F883E7AAD743EEB5AC8BD4B210F3ECE1F303E3D4983BDA3A3F63D3D993FBE868FF1BC89B98ABE13D72E3D5012703E826D35BE725C76BECA748B3ED59EA23D63F6103E2F3527BE354722BECE329DBE39E496BE46C71CBDC849E83D8BD2ADBD2596A73DF2DA0ABED46BA03D752989BDB8624CBE971808BEB1C184BDCC23743E36B4AB3C7D51683E7ADA4B3EDA6EDD3D2DFDF1BDC9A5D5BD948E7E3ECFCE7FBE6E9E813D85DBE43B6543143D379970BEA80F16BED17C18BD05E9193E0F403B3D26EDA43B808C9ABEA6A2823D08D786BE2314AABCD8D783BE562D9ABE2B60943E88C91ABEB2DC513E136825BEBB0DBD3D8E26E7BDF4464EBDD9E68E3E5B5115BE3FCF123E1E194FBE2CACBF3DC71E153D96F24B3E53E9E9BD407141BE714361BE4D2E963DA67B8F3EB62B78BE0D424E3D5EBB7F3D056098BEC5F011BDCB26033E1656DD3C60B90DBE7EFF583DE9C118BDDDF5673E451A58BD8E0C32BE121C05BC8D148E3ED80AEA3DBE27AA3A1F534B3E97023EBD85C231BEEB7CF33BF5FF363D32DCDBBDFBFED83DB640623D66E5EA3D785D2B3E4E7470BE651C673EDC57473E601A24BECF5FFABD38AA14BE9EFC60BCE3E5983E10EE7D3E2BE0453ECE3C41BE0A3B753E4225383D967059BDC197B8BD4FA407BE5C0C17BE0F16953E9884E7BC2E6CF6BDAF01453EB6F466BE37968ABE5BC056BECBD28E3ED9AC493DACC7103D46EB73BEBB39213DFD30F0BD835CD4BD426F4FBD620472BE67FB97BEFBAE8CBDF08F553EFAB77D3EFCC7D3BC76249D3EA2C404BE937B9B3E14197E3D30212D3E262F63BD7899D1BDF72C993E7537DE3D6FD39D3E988D4B3DDB132B3E097461BD8461313C"> : tensor<10x20xf32> + %cst = arith.constant dense<"0xEDAEFD3D2456463E409C203E725C76BE7D51683ED8D783BEA67B8F3E85C231BE4225383D620472BE6B88963EA8E7983EA0BE27BCCA748B3E7ADA4B3E562D9ABEB62B78BEEB7CF33B967059BD67FB97BE1051E33DCE3D9EBCA3380BBED59EA23DDA6EDD3D2B60943E0D424E3DF5FF363DC197B8BDFBAE8CBDFC7732BE690042BEC03C5A3E63F6103E2DFDF1BD88C91ABE5EBB7F3D32DCDBBD4FA407BEF08F553E055CEE3CF0D34CBD2775633D2F3527BEC9A5D5BDB2DC513E056098BEFBFED83D5C0C17BEFAB77D3E07F9413E5AFCAB3D62069C3E354722BE948E7E3E136825BEC5F011BDB640623D0F16953EFCC7D3BC080B9CBECC1AF13D0DF3963DCE329DBECFCE7FBEBB0DBD3DCB26033E66E5EA3D9884E7BC76249D3E9B2F4ABE8E3CF43B259F883E39E496BE6E9E813D8E26E7BD1656DD3C785D2B3E2E6CF6BDA2C404BE5608BD3B0BF2583E7AAD743E46C71CBD85DBE43BF4464EBD60B90DBE4E7470BEAF01453E937B9B3EF8E6DFBDC82B583DEB5AC8BDC849E83D6543143DD9E68E3E7EFF583D651C673EB6F466BE14197E3D507F46BE658D653C4B210F3E8BD2ADBD379970BE5B5115BEE9C118BDDC57473E37968ABE30212D3EC61183BE79C131BECE1F303E2596A73DA80F16BE3FCF123EDDF5673E601A24BE5BC056BE262F63BDACE23ABE9AEF24BD3D4983BDF2DA0ABED17C18BD1E194FBE451A58BDCF5FFABDCBD28E3E7899D1BDF010903E85B4B03DA3A3F63DD46BA03D05E9193E2CACBF3D8E0C32BE38AA14BED9AC493DF72C993E824129BE46DBAF3D3D993FBE752989BD0F403B3DC71E153D121C05BC9EFC60BCACC7103D7537DE3DAFB6D83CEB4013BD868FF1BCB8624CBE26EDA43B96F24B3E8D148E3EE3E5983E46EB73BE6FD39D3E779721BE26A9693E89B98ABE971808BE808C9ABE53E9E9BDD80AEA3D10EE7D3EBB39213D988D4B3D953B2B3EB17CC43C13D72E3DB1C184BDA6A2823D407141BEBE27AA3A2BE0453EFD30F0BDDB132B3E8B44CB3BEDAF77BE5012703ECC23743E08D786BE714361BE1F534B3ECE3C41BE835CD4BD097461BDB09383BEA24E5BBE826D35BE36B4AB3C2314AABC4D2E963D97023EBD0A3B753E426F4FBD8461313C"> : tensor<20x10xf32> + %cst_0 = arith.constant dense<[0.101879634, -0.178835288, -0.0953023583, -0.0698504745, -0.19658649, -0.297641844, -0.223349303, 0.168986112, 9.007710e-02, 0.101534814, -0.0601868108, -0.0958566219, -0.243612081, 0.198881537, -0.293788224, -0.240900397, 
0.184188008, 0.210917979, 0.121171109, -0.155078679]> : tensor<20xf32> + %cst_1 = arith.constant dense<"0x32A148BE6D99D1BD3C565A3E614360BC1E9BDF3B265A9D3DBD5F18BE8FE25B3EC201E4BC085E783CA941FFBDD2D3953D6BA636BCD3A570BDBF6DFBBD50263FBE6514C1BCC2C95FBEFCD7B43DE477DBBC93869BBDBEED34BD62E5843DC40D4F3EFF38F73C45B35D3EDE0A0EBECC6652BE35B79BBD9044093DF13E123EC271633D17C3273E3C32473E5F161E3C52262FBE1F9E173EA30B07BE3C7D24BE674E2CBEDF0E313EA2CFA0BCA45CAF3C5A313E3EC33C35BD112654BBEDE673BD20FF4CBE09665BBE916844BE4C305C3E6BF8053DBE032E3E614DAD3AB941F3BD207BA93DA59780BD15F2E4BDCE26BD3DD5E3593DADAE1EBDBFCE46BE41EC6DBB34D6FDBD40D157BE8BEEB6BD23F447BED78823BD2F2FC8BD162F39BE5C838CBDA6DFECBD34F4173C6978643EF40BB2BD661E823D511514BD611C4BBE48BF20BEBCCB2FBE5C52443E85B19BBCE2019E3D9C3519BEDD478A3CBD70993DB37F3BBE32E5003EBEFA293E955128BEF44A6EBD5728953D54B10A3EC49A543E4C2E6ABDEC4B56BE9AB8F4BD4FEF053CF7F0D7BBD7D1303E3506E7BB4D7659BE1D3A013E72500C3DD1E6373E7DDE62BE052B503E392A323D668CEDBDB6F210BD2A505EBEB72E1DBD9452EB3D02AA16BE3B33963D58CF3A3E339719BEC73E013E421F193EA9753FBEF2CFB3BD398863BE42C74CBE1B39AABD436BEEBDDB8017BE5683343EE1BAA13D0F397ABDA4C90BBDF0BFA83A2B345DBD82FDD33DDFDE243D7B3507BE913828BEB72639BE9DF7B5BD85BA77BD017FC43D4CF5A13DC7A3DD3D4E3CE63D9B2983BD04BE2E3EE2AC4FBE13D6283EFB3A9BBDC1675D3ECD44643E8BC9E03D7053E83C005014BE7ADBE9BD3344553ED66E42BE38C9D73CED4160BECAE272BAA94A9B3DBC209B3D590C1ABE734E60BE110010BDBD5F2B3D9F8E213EC693A03D9FF356BDC0F1E03D511B3DBED0DA303DE305B1BD578D51BDE1224FBED50454BD57F2B2BDB5062F3E917CFDBDB0FD9CBDE74A0E3E697E213D4A942DBE530B01BEAFAD27BC1D7813BE592C1B3E751BB5BD3C4FE13D164A5FBD8941593E64522C3EF2F3013EF3E116BE6C6826BE689A233EFFB9503E895743BD7FC9523E487E3FBE2FFC273E4A14543D9EB929BECD1844BEEAFFB5BD0EF101BE809B3DBEF94F243EFDFA51BDAA43353C30FEC73C58A96FBD0DE876BD4EDE0F3E5EC4083E263939BEE2FD4A3E413055BE0A8B643EBBF9603DA596B2BDAB303B3E6A0D20BEEC38BE3C4A82103D0369A1BD7DF30B3E5014C23CB60E2C3E19AC2FBEBEC5D73D7211593E8F1098BD47BF58BE7B00783D1D60D1BB2F23A43C423D12BCC300013E8B5E24BED8554A3CB6C1FA3B941C90BD19E862BE8E7AD13C8FF8D93DD463133D003410BEBEE61D3E1A2E90BC8FF2A2BDC3BEA6BC7232E23DC35E793D9E975C3E4E3959BE542B493E402404BE966686BD27A9C33D703E19BE3524DBBC097A76BCFC6F0DBED2693FBE829D3A3E672DEB3DEBBD613EC5EFDA3D3881E6BDCB4F4B3E14EF8ABD4CDE0C3D7195303EEF3A1FBD0BAF203CE2CA34BEB4A8463ED0DC1ABE95A2063E04CF7BBDE1C81ABE0640443EE58D4F3D3E6125BE85CB2EBCEB5A44BECADD55BEE721A0BDC3EA5EBE4F6DC2BBFE5949BEAF43A93DD3A822BEC7AD1DBD719EC8BD2966513E4EA9F0BD05B535BE6DD0BC3D9155863D9DDF51BEAE5B30BE03A7AA3D03FD743D6A2B023EFFA2B03C56C2C3BD13E8F3BC550D21BE2249463E43880FBE1A34DE3DA331963ACB200EBD48F3273EBB4B1FBD57EA383CCD8A573EA359173E65FA1EBEBF11173E723C573E1DCF543C3B3825BEEF8C273E894A33BE8922313D0C34F5BDB6B3023E35888ABC46CE16BDFB1247BD9CF9403EE55E993DFB4B103D391127BE2432453C475E283DDA01A5BD144C573EFFDA4C3EC0EE403E756D503EB7FCD53DB6F3C2BD8A2935BEA41CF7BD3E911D3E1E97B0BD80299C3D097B633D569B3F3E1FD32DBD734D97BC7943B0BDC942A7BD9611263ECECB5EBD505B97BD709798BB6E8A2DBE0F2024BE81D23BBECCDA58BE27F955BD93A6223E0CF8D5BDFD7F38BD500C6DBD70BAFEBD7775193E09619BBD4EFCBB3D4C6E543EA66038BE196DD9BDD58B283EFDE8B13DDEE6713DF860CA3CF9F5013E3D24D53DE0D914BE2B3A4CBEAE3B4ABCE10741BE13AA29BE3C54933D9577BB3D40F617BEB7E90F3E54502A3EB47251BD762EE83DC91500BE8AC012BD6313C13D367E3EBE5283B43DC63F44BE"> : tensor<20x20xf32> + %cst_2 = arith.constant dense<[0.124238588, -0.0375917405, -0.178324029, 2.1018261E-4, -0.0708629936, 0.179958493, 0.201986402, -0.0302014686, -0.0842267424, 0.0796111747, 0.0201944318, -0.183529228, -0.133614406, 
-0.0192934573, 0.193412527, 0.219010666, -0.0464102961, 0.00334274326, -0.0029087835, 0.0903228372]> : tensor<20xf32> + %cst_3 = arith.constant dense<"0xD60ED23DDEFF3B3DFBC1653B7AFAC43DAB54263EDA0774BD602A03BE747C53BEC754B93D306D813DBFD7253E96216E3CEA885CBCB90511BEA1F012BEDCF4F23DAC1B29BD128D2ABD6024AD3D838E9CBD6677593E5270AF3D77D0473E933CF8BDC31061BE37FD1E3D522F3FBEC1814A3EC9CB88BCE33C16BECFBD0D3E0351123E19FFD9BD36A5443ED3483E3EA4D8C3BDDEB4CD3D46CA34BD0D0EE9BDA171FA3DEDD4253D461635BC7920B43CFE637BBDF81913BE1EB44BBE9C1DDC3DA77B363EB5D84DBE39B00EBEDC6CD83D034807BE8B68F83C86AA713CB3F664BE4877DFBC6C5006BEDDA5BABDBD5A80BCDCED3EBE579F523E07D5B8BD2135C43C21BE53BE3E001C3E38E409BE486F1EBE551E113EF14E443E946CD7BDA6EA563D0929E7BD6FE3B5BC3698A3BCEB093DBE1B13303B61B08FBCE0C404BD7FA6E83D9FEDA6BD952583BDB82761BE1780C2BBADCA233E4D7A0E3E994A1FBEB99D21BE65E5673DD54E1EBEB97CC9BD9238C53CD3E3453DC302633EB129A93DEF2FB7BC60330D3E5AD9DA3B533F223D400BFBBDAD78A8BD454F62BE33120CBED145D43C250529BDE4CE4EBE74FFDB3DE05813BE0EDAC4BD659C343E08A411BD1405B2BDCD388B3D736E193E517218BE3243F6BCCA5569BD80340EBE3072B4BD616438BED19702BD2C2AB53D692C35BC15B1203D4576FABD2119CFBDBB05A93D074943BEC9732E3D349F62BEC933103E4948D83C68C034BCD8D1503EA7BB51BE2EFA283E97A090BD933AD83D7D7C413E34CBC73D18B3043E827CF4BDDB70183EAAD3623C2B7B84BCC6B6123E29209A3D4233E33CD45C3C3EF5BBD73DDE725FBE0A72B03DE7650E3EBEB64B3ECA19393EB41EF5BD3A5E9C3D6334E93CB831DABD4769D73D39CF873C7F4D41BDED6F4D3E9A1808BECCB48FBCB3CA01BE756B01BD32BBFA3D1BCB113E1042FDBDF3C1553DEC8AD93D41C70D3EE0263C3E8948043EF16A573D3565B63DE5F18EBD3359BBBB4E295EBE46E602BC15896C3D271EAD3DCF144D3E84B3D6BDB506BF3DEAA162BD81F31ABE9DD61DBDBFDD4EBD830E3B3CD04F3A3E392C32BEAC4E223D5DE08BBD19E6E83DE54F6BBDCC3D73BD0A69993DECBE843CD0CBC33D"> : tensor<10x20xf32> + %cst_4 = arith.constant dense<[0.0670170113, 0.0825609341, -0.125343189, -0.0073415176, -0.100303039, -0.214000896, 0.114002995, 0.21737574, 0.166609675, -0.119800359]> : tensor<10xf32> %0 = tensor.empty() : tensor<2x20xf32> - %1 = byre.compute_on_tensor @MatmulOp_f32f32_f32 {lhs_contracting_dimension = 1 : i64, rhs_contracting_dimension = 0 : i64} ins(%arg0, %cst_4 : tensor<2x10xf32>, tensor<10x20xf32>) outs(%0 : tensor<2x20xf32>) : tensor<2x20xf32> - %2 = call @Unknown0(%cst, %1) : (tensor<20xf32>, tensor<2x20xf32>) -> tensor<2x20xf32> + %1 = byre.compute_on_tensor @MatmulOp_f32f32_f32 {lhs_contracting_dimension = 1 : i64, rhs_contracting_dimension = 1 : i64} ins(%arg0, %cst : tensor<2x10xf32>, tensor<20x10xf32>) outs(%0 : tensor<2x20xf32>) : tensor<2x20xf32> + %2 = call @Unknown0(%cst_0, %1) : (tensor<20xf32>, tensor<2x20xf32>) -> tensor<2x20xf32> %3 = tensor.empty() : tensor<2x20xf32> - %4 = byre.compute_on_tensor @MatmulOp_f32f32_f32 {lhs_contracting_dimension = 1 : i64, rhs_contracting_dimension = 0 : i64} ins(%2, %cst_3 : tensor<2x20xf32>, tensor<20x20xf32>) outs(%3 : tensor<2x20xf32>) : tensor<2x20xf32> - %5 = call @Unknown1(%cst_0, %4) : (tensor<20xf32>, tensor<2x20xf32>) -> tensor<2x20xf32> + %4 = byre.compute_on_tensor @MatmulOp_f32f32_f32 {lhs_contracting_dimension = 1 : i64, rhs_contracting_dimension = 1 : i64} ins(%2, %cst_1 : tensor<2x20xf32>, tensor<20x20xf32>) outs(%3 : tensor<2x20xf32>) : tensor<2x20xf32> + %5 = call @Unknown0(%cst_2, %4) : (tensor<20xf32>, tensor<2x20xf32>) -> tensor<2x20xf32> %6 = tensor.empty() : tensor<2x10xf32> - %7 = byre.compute_on_tensor @MatmulOp_f32f32_f32 {lhs_contracting_dimension = 1 : i64, rhs_contracting_dimension = 0 : i64} ins(%5, %cst_2 : tensor<2x20xf32>, tensor<20x10xf32>) outs(%6 : 
tensor<2x10xf32>) : tensor<2x10xf32> - %8 = call @Unknown2(%cst_1, %7) : (tensor<10xf32>, tensor<2x10xf32>) -> tensor<2x10xf32> + %7 = byre.compute_on_tensor @MatmulOp_f32f32_f32 {lhs_contracting_dimension = 1 : i64, rhs_contracting_dimension = 1 : i64} ins(%5, %cst_3 : tensor<2x20xf32>, tensor<10x20xf32>) outs(%6 : tensor<2x10xf32>) : tensor<2x10xf32> + %8 = call @Unknown2(%cst_4, %7) : (tensor<10xf32>, tensor<2x10xf32>) -> tensor<2x10xf32> return %8 : tensor<2x10xf32> } -} +} \ No newline at end of file diff --git a/compiler/test/E2E/MLPInference/5_affine_opt.mlir b/compiler/test/E2E/MLPInference/5_affine_opt.mlir index aeb9723dd..4c8f4570e 100644 --- a/compiler/test/E2E/MLPInference/5_affine_opt.mlir +++ b/compiler/test/E2E/MLPInference/5_affine_opt.mlir @@ -2,62 +2,72 @@ // CHECK-LABEL: func.func @forward -#map = affine_map<(d0, d1) -> (d0, d1)> -#map1 = affine_map<(d0, d1) -> (d1)> +#map = affine_map<() -> ()> module attributes {torch.debug_module_name = "GraphModule"} { - memref.global "private" constant @__constant_10x20xf32 : memref<10x20xf32> = dense<"0xEDAEFD3D6B88963E1051E33DFC7732BE055CEE3C07F9413E080B9CBE9B2F4ABE5608BD3BF8E6DFBD507F46BEC61183BEACE23ABEF010903E824129BEAFB6D83C779721BE953B2B3E8B44CB3BB09383BE2456463EA8E7983ECE3D9EBC690042BEF0D34CBD5AFCAB3DCC1AF13D8E3CF43B0BF2583EC82B583D658D653C79C131BE9AEF24BD85B4B03D46DBAF3DEB4013BD26A9693EB17CC43CEDAF77BEA24E5BBE409C203EA0BE27BCA3380BBEC03C5A3E2775633D62069C3E0DF3963D259F883E7AAD743EEB5AC8BD4B210F3ECE1F303E3D4983BDA3A3F63D3D993FBE868FF1BC89B98ABE13D72E3D5012703E826D35BE725C76BECA748B3ED59EA23D63F6103E2F3527BE354722BECE329DBE39E496BE46C71CBDC849E83D8BD2ADBD2596A73DF2DA0ABED46BA03D752989BDB8624CBE971808BEB1C184BDCC23743E36B4AB3C7D51683E7ADA4B3EDA6EDD3D2DFDF1BDC9A5D5BD948E7E3ECFCE7FBE6E9E813D85DBE43B6543143D379970BEA80F16BED17C18BD05E9193E0F403B3D26EDA43B808C9ABEA6A2823D08D786BE2314AABCD8D783BE562D9ABE2B60943E88C91ABEB2DC513E136825BEBB0DBD3D8E26E7BDF4464EBDD9E68E3E5B5115BE3FCF123E1E194FBE2CACBF3DC71E153D96F24B3E53E9E9BD407141BE714361BE4D2E963DA67B8F3EB62B78BE0D424E3D5EBB7F3D056098BEC5F011BDCB26033E1656DD3C60B90DBE7EFF583DE9C118BDDDF5673E451A58BD8E0C32BE121C05BC8D148E3ED80AEA3DBE27AA3A1F534B3E97023EBD85C231BEEB7CF33BF5FF363D32DCDBBDFBFED83DB640623D66E5EA3D785D2B3E4E7470BE651C673EDC57473E601A24BECF5FFABD38AA14BE9EFC60BCE3E5983E10EE7D3E2BE0453ECE3C41BE0A3B753E4225383D967059BDC197B8BD4FA407BE5C0C17BE0F16953E9884E7BC2E6CF6BDAF01453EB6F466BE37968ABE5BC056BECBD28E3ED9AC493DACC7103D46EB73BEBB39213DFD30F0BD835CD4BD426F4FBD620472BE67FB97BEFBAE8CBDF08F553EFAB77D3EFCC7D3BC76249D3EA2C404BE937B9B3E14197E3D30212D3E262F63BD7899D1BDF72C993E7537DE3D6FD39D3E988D4B3DDB132B3E097461BD8461313C"> - memref.global "private" constant @__constant_20x20xf32 : memref<20x20xf32> = 
dense<"0x32A148BE93869BBDDF0E313EADAE1EBD5C52443E3506E7BBF2CFB3BD4CF5A13DBC209B3D697E213D4A14543DAB303B3EB6C1FA3B3524DBBCE1C81ABE9DDF51BEBF11173EFFDA4C3E6E8A2DBEF9F5013E6D99D1BDBEED34BDA2CFA0BCBFCE46BE85B19BBC4D7659BE398863BEC7A3DD3D590C1ABE4A942DBE9EB929BE6A0D20BE941C90BD097A76BC0640443EAE5B30BE723C573EC0EE403E0F2024BE3D24D53D3C565A3E62E5843DA45CAF3C41EC6DBBE2019E3D1D3A013E42C74CBE4E3CE63D734E60BE530B01BECD1844BEEC38BE3C19E862BEFC6F0DBEE58D4F3D03A7AA3D1DCF543C756D503E81D23BBEE0D914BE614360BCC40D4F3E5A313E3E34D6FDBD9C3519BE72500C3D1B39AABD9B2983BD110010BDAFAD27BCEAFFB5BD4A82103D8E7AD13CD2693FBE3E6125BE03FD743D3B3825BEB7FCD53DCCDA58BE2B3A4CBE1E9BDF3BFF38F73CC33C35BD40D157BEDD478A3CD1E6373E436BEEBD04BE2E3EBD5F2B3D1D7813BE0EF101BE0369A1BD8FF8D93D829D3A3E85CB2EBC6A2B023EEF8C273EB6F3C2BD27F955BDAE3B4ABC265A9D3D45B35D3E112654BB8BEEB6BDBD70993D7DDE62BEDB8017BEE2AC4FBE9F8E213E592C1B3E809B3DBE7DF30B3ED463133D672DEB3DEB5A44BEFFA2B03C894A33BE8A2935BE93A6223EE10741BEBD5F18BEDE0A0EBEEDE673BD23F447BEB37F3BBE052B503E5683343E13D6283EC693A03D751BB5BDF94F243E5014C23C003410BEEBBD613ECADD55BE56C2C3BD8922313DA41CF7BD0CF8D5BD13AA29BE8FE25B3ECC6652BE20FF4CBED78823BD32E5003E392A323DE1BAA13DFB3A9BBD9FF356BD3C4FE13DFDFA51BDB60E2C3EBEE61D3EC5EFDA3DE721A0BD13E8F3BC0C34F5BD3E911D3EFD7F38BD3C54933DC201E4BC35B79BBD09665BBE2F2FC8BDBEFA293E668CEDBD0F397ABDC1675D3EC0F1E03D164A5FBDAA43353C19AC2FBE1A2E90BC3881E6BDC3EA5EBE550D21BEB6B3023E1E97B0BD500C6DBD9577BB3D085E783C9044093D916844BE162F39BE955128BEB6F210BDA4C90BBDCD44643E511B3DBE8941593E30FEC73CBEC5D73D8FF2A2BDCB4F4B3E4F6DC2BB2249463E35888ABC80299C3D70BAFEBD40F617BEA941FFBDF13E123E4C305C3E5C838CBDF44A6EBD2A505EBEF0BFA83A8BC9E03DD0DA303D64522C3E58A96FBD7211593EC3BEA6BC14EF8ABDFE5949BE43880FBE46CE16BD097B633D7775193EB7E90F3ED2D3953DC271633D6BF8053DA6DFECBD5728953DB72E1DBD2B345DBD7053E83CE305B1BDF2F3013E0DE876BD8F1098BD7232E23D4CDE0C3DAF43A93D1A34DE3DFB1247BD569B3F3E09619BBD54502A3E6BA636BC17C3273EBE032E3E34F4173C54B10A3E9452EB3D82FDD33D005014BE578D51BDF3E116BE4EDE0F3E47BF58BEC35E793D7195303ED3A822BEA331963A9CF9403E1FD32DBD4EFCBB3DB47251BDD3A570BD3C32473E614DAD3A6978643EC49A543E02AA16BEDFDE243D7ADBE9BDE1224FBE6C6826BE5EC4083E7B00783D9E975C3EEF3A1FBDC7AD1DBDCB200EBDE55E993D734D97BC4C6E543E762EE83DBF6DFBBD5F161E3CB941F3BDF40BB2BD4C2E6ABD3B33963D7B3507BE3344553ED50454BD689A233E263939BE1D60D1BB4E3959BE0BAF203C719EC8BD48F3273EFB4B103D7943B0BDA66038BEC91500BE50263FBE52262FBE207BA93D661E823DEC4B56BE58CF3A3E913828BED66E42BE57F2B2BDFFB9503EE2FD4A3E2F23A43C542B493EE2CA34BE2966513EBB4B1FBD391127BEC942A7BD196DD9BD8AC012BD6514C1BC1F9E173EA59780BD511514BD9AB8F4BD339719BEB72639BE38C9D73CB5062F3E895743BD413055BE423D12BC402404BEB4A8463E4EA9F0BD57EA383C2432453C9611263ED58B283E6313C13DC2C95FBEA30B07BE15F2E4BD611C4BBE4FEF053CC73E013E9DF7B5BDED4160BE917CFDBD7FC9523E0A8B643EC300013E966686BDD0DC1ABE05B535BECD8A573E475E283DCECB5EBDFDE8B13D367E3EBEFCD7B43D3C7D24BECE26BD3D48BF20BEF7F0D7BB421F193E85BA77BDCAE272BAB0FD9CBD487E3FBEBBF9603D8B5E24BE27A9C33D95A2063E6DD0BC3DA359173EDA01A5BD505B97BDDEE6713D5283B43DE477DBBC674E2CBED5E3593DBCCB2FBED7D1303EA9753FBE017FC43DA94A9B3DE74A0E3E2FFC273EA596B2BDD8554A3C703E19BE04CF7BBD9155863D65FA1EBE144C573E709798BBF860CA3CC63F44BE"> - memref.global "private" constant @__constant_20x10xf32 : memref<20x10xf32> = 
dense<"0xD60ED23D6677593EEDD4253D579F523E952583BD454F62BE2C2AB53D827CF4BD7F4D41BD15896C3DDEFF3B3D5270AF3D461635BC07D5B8BDB82761BE33120CBE692C35BCDB70183EED6F4D3E271EAD3DFBC1653B77D0473E7920B43C2135C43C1780C2BBD145D43C15B1203DAAD3623C9A1808BECF144D3E7AFAC43D933CF8BDFE637BBD21BE53BEADCA233E250529BD4576FABD2B7B84BCCCB48FBC84B3D6BDAB54263EC31061BEF81913BE3E001C3E4D7A0E3EE4CE4EBE2119CFBDC6B6123EB3CA01BEB506BF3DDA0774BD37FD1E3D1EB44BBE38E409BE994A1FBE74FFDB3DBB05A93D29209A3D756B01BDEAA162BD602A03BE522F3FBE9C1DDC3D486F1EBEB99D21BEE05813BE074943BE4233E33C32BBFA3D81F31ABE747C53BEC1814A3EA77B363E551E113E65E5673D0EDAC4BDC9732E3DD45C3C3E1BCB113E9DD61DBDC754B93DC9CB88BCB5D84DBEF14E443ED54E1EBE659C343E349F62BEF5BBD73D1042FDBDBFDD4EBD306D813DE33C16BE39B00EBE946CD7BDB97CC9BD08A411BDC933103EDE725FBEF3C1553D830E3B3CBFD7253ECFBD0D3EDC6CD83DA6EA563D9238C53C1405B2BD4948D83C0A72B03DEC8AD93DD04F3A3E96216E3C0351123E034807BE0929E7BDD3E3453DCD388B3D68C034BCE7650E3E41C70D3E392C32BEEA885CBC19FFD9BD8B68F83C6FE3B5BCC302633E736E193ED8D1503EBEB64B3EE0263C3EAC4E223DB90511BE36A5443E86AA713C3698A3BCB129A93D517218BEA7BB51BECA19393E8948043E5DE08BBDA1F012BED3483E3EB3F664BEEB093DBEEF2FB7BC3243F6BC2EFA283EB41EF5BDF16A573D19E6E83DDCF4F23DA4D8C3BD4877DFBC1B13303B60330D3ECA5569BD97A090BD3A5E9C3D3565B63DE54F6BBDAC1B29BDDEB4CD3D6C5006BE61B08FBC5AD9DA3B80340EBE933AD83D6334E93CE5F18EBDCC3D73BD128D2ABD46CA34BDDDA5BABDE0C404BD533F223D3072B4BD7D7C413EB831DABD3359BBBB0A69993D6024AD3D0D0EE9BDBD5A80BC7FA6E83D400BFBBD616438BE34CBC73D4769D73D4E295EBEECBE843C838E9CBDA171FA3DDCED3EBE9FEDA6BDAD78A8BDD19702BD18B3043E39CF873C46E602BCD0CBC33D"> memref.global "private" constant @__constant_10xf32 : memref<10xf32> = dense<[0.0670170113, 0.0825609341, -0.125343189, -0.0073415176, -0.100303039, -0.214000896, 0.114002995, 0.21737574, 0.166609675, -0.119800359]> + memref.global "private" constant @__constant_10x20xf32 : memref<10x20xf32> = dense<"0xD60ED23DDEFF3B3DFBC1653B7AFAC43DAB54263EDA0774BD602A03BE747C53BEC754B93D306D813DBFD7253E96216E3CEA885CBCB90511BEA1F012BEDCF4F23DAC1B29BD128D2ABD6024AD3D838E9CBD6677593E5270AF3D77D0473E933CF8BDC31061BE37FD1E3D522F3FBEC1814A3EC9CB88BCE33C16BECFBD0D3E0351123E19FFD9BD36A5443ED3483E3EA4D8C3BDDEB4CD3D46CA34BD0D0EE9BDA171FA3DEDD4253D461635BC7920B43CFE637BBDF81913BE1EB44BBE9C1DDC3DA77B363EB5D84DBE39B00EBEDC6CD83D034807BE8B68F83C86AA713CB3F664BE4877DFBC6C5006BEDDA5BABDBD5A80BCDCED3EBE579F523E07D5B8BD2135C43C21BE53BE3E001C3E38E409BE486F1EBE551E113EF14E443E946CD7BDA6EA563D0929E7BD6FE3B5BC3698A3BCEB093DBE1B13303B61B08FBCE0C404BD7FA6E83D9FEDA6BD952583BDB82761BE1780C2BBADCA233E4D7A0E3E994A1FBEB99D21BE65E5673DD54E1EBEB97CC9BD9238C53CD3E3453DC302633EB129A93DEF2FB7BC60330D3E5AD9DA3B533F223D400BFBBDAD78A8BD454F62BE33120CBED145D43C250529BDE4CE4EBE74FFDB3DE05813BE0EDAC4BD659C343E08A411BD1405B2BDCD388B3D736E193E517218BE3243F6BCCA5569BD80340EBE3072B4BD616438BED19702BD2C2AB53D692C35BC15B1203D4576FABD2119CFBDBB05A93D074943BEC9732E3D349F62BEC933103E4948D83C68C034BCD8D1503EA7BB51BE2EFA283E97A090BD933AD83D7D7C413E34CBC73D18B3043E827CF4BDDB70183EAAD3623C2B7B84BCC6B6123E29209A3D4233E33CD45C3C3EF5BBD73DDE725FBE0A72B03DE7650E3EBEB64B3ECA19393EB41EF5BD3A5E9C3D6334E93CB831DABD4769D73D39CF873C7F4D41BDED6F4D3E9A1808BECCB48FBCB3CA01BE756B01BD32BBFA3D1BCB113E1042FDBDF3C1553DEC8AD93D41C70D3EE0263C3E8948043EF16A573D3565B63DE5F18EBD3359BBBB4E295EBE46E602BC15896C3D271EAD3DCF144D3E84B3D6BDB506BF3DEAA162BD81F31ABE9DD61DBDBFDD4EBD830E3B3CD04F3A3E392C32BEAC4E223D5DE08BBD19E6E83DE54F6BBDCC3D73BD0A69993DECBE843CD0CBC33D"> memref.global "private" constant 
@__constant_20xf32_0 : memref<20xf32> = dense<[0.124238588, -0.0375917405, -0.178324029, 2.1018261E-4, -0.0708629936, 0.179958493, 0.201986402, -0.0302014686, -0.0842267424, 0.0796111747, 0.0201944318, -0.183529228, -0.133614406, -0.0192934573, 0.193412527, 0.219010666, -0.0464102961, 0.00334274326, -0.0029087835, 0.0903228372]> + memref.global "private" constant @__constant_20x20xf32 : memref<20x20xf32> = dense<"0x32A148BE6D99D1BD3C565A3E614360BC1E9BDF3B265A9D3DBD5F18BE8FE25B3EC201E4BC085E783CA941FFBDD2D3953D6BA636BCD3A570BDBF6DFBBD50263FBE6514C1BCC2C95FBEFCD7B43DE477DBBC93869BBDBEED34BD62E5843DC40D4F3EFF38F73C45B35D3EDE0A0EBECC6652BE35B79BBD9044093DF13E123EC271633D17C3273E3C32473E5F161E3C52262FBE1F9E173EA30B07BE3C7D24BE674E2CBEDF0E313EA2CFA0BCA45CAF3C5A313E3EC33C35BD112654BBEDE673BD20FF4CBE09665BBE916844BE4C305C3E6BF8053DBE032E3E614DAD3AB941F3BD207BA93DA59780BD15F2E4BDCE26BD3DD5E3593DADAE1EBDBFCE46BE41EC6DBB34D6FDBD40D157BE8BEEB6BD23F447BED78823BD2F2FC8BD162F39BE5C838CBDA6DFECBD34F4173C6978643EF40BB2BD661E823D511514BD611C4BBE48BF20BEBCCB2FBE5C52443E85B19BBCE2019E3D9C3519BEDD478A3CBD70993DB37F3BBE32E5003EBEFA293E955128BEF44A6EBD5728953D54B10A3EC49A543E4C2E6ABDEC4B56BE9AB8F4BD4FEF053CF7F0D7BBD7D1303E3506E7BB4D7659BE1D3A013E72500C3DD1E6373E7DDE62BE052B503E392A323D668CEDBDB6F210BD2A505EBEB72E1DBD9452EB3D02AA16BE3B33963D58CF3A3E339719BEC73E013E421F193EA9753FBEF2CFB3BD398863BE42C74CBE1B39AABD436BEEBDDB8017BE5683343EE1BAA13D0F397ABDA4C90BBDF0BFA83A2B345DBD82FDD33DDFDE243D7B3507BE913828BEB72639BE9DF7B5BD85BA77BD017FC43D4CF5A13DC7A3DD3D4E3CE63D9B2983BD04BE2E3EE2AC4FBE13D6283EFB3A9BBDC1675D3ECD44643E8BC9E03D7053E83C005014BE7ADBE9BD3344553ED66E42BE38C9D73CED4160BECAE272BAA94A9B3DBC209B3D590C1ABE734E60BE110010BDBD5F2B3D9F8E213EC693A03D9FF356BDC0F1E03D511B3DBED0DA303DE305B1BD578D51BDE1224FBED50454BD57F2B2BDB5062F3E917CFDBDB0FD9CBDE74A0E3E697E213D4A942DBE530B01BEAFAD27BC1D7813BE592C1B3E751BB5BD3C4FE13D164A5FBD8941593E64522C3EF2F3013EF3E116BE6C6826BE689A233EFFB9503E895743BD7FC9523E487E3FBE2FFC273E4A14543D9EB929BECD1844BEEAFFB5BD0EF101BE809B3DBEF94F243EFDFA51BDAA43353C30FEC73C58A96FBD0DE876BD4EDE0F3E5EC4083E263939BEE2FD4A3E413055BE0A8B643EBBF9603DA596B2BDAB303B3E6A0D20BEEC38BE3C4A82103D0369A1BD7DF30B3E5014C23CB60E2C3E19AC2FBEBEC5D73D7211593E8F1098BD47BF58BE7B00783D1D60D1BB2F23A43C423D12BCC300013E8B5E24BED8554A3CB6C1FA3B941C90BD19E862BE8E7AD13C8FF8D93DD463133D003410BEBEE61D3E1A2E90BC8FF2A2BDC3BEA6BC7232E23DC35E793D9E975C3E4E3959BE542B493E402404BE966686BD27A9C33D703E19BE3524DBBC097A76BCFC6F0DBED2693FBE829D3A3E672DEB3DEBBD613EC5EFDA3D3881E6BDCB4F4B3E14EF8ABD4CDE0C3D7195303EEF3A1FBD0BAF203CE2CA34BEB4A8463ED0DC1ABE95A2063E04CF7BBDE1C81ABE0640443EE58D4F3D3E6125BE85CB2EBCEB5A44BECADD55BEE721A0BDC3EA5EBE4F6DC2BBFE5949BEAF43A93DD3A822BEC7AD1DBD719EC8BD2966513E4EA9F0BD05B535BE6DD0BC3D9155863D9DDF51BEAE5B30BE03A7AA3D03FD743D6A2B023EFFA2B03C56C2C3BD13E8F3BC550D21BE2249463E43880FBE1A34DE3DA331963ACB200EBD48F3273EBB4B1FBD57EA383CCD8A573EA359173E65FA1EBEBF11173E723C573E1DCF543C3B3825BEEF8C273E894A33BE8922313D0C34F5BDB6B3023E35888ABC46CE16BDFB1247BD9CF9403EE55E993DFB4B103D391127BE2432453C475E283DDA01A5BD144C573EFFDA4C3EC0EE403E756D503EB7FCD53DB6F3C2BD8A2935BEA41CF7BD3E911D3E1E97B0BD80299C3D097B633D569B3F3E1FD32DBD734D97BC7943B0BDC942A7BD9611263ECECB5EBD505B97BD709798BB6E8A2DBE0F2024BE81D23BBECCDA58BE27F955BD93A6223E0CF8D5BDFD7F38BD500C6DBD70BAFEBD7775193E09619BBD4EFCBB3D4C6E543EA66038BE196DD9BDD58B283EFDE8B13DDEE6713DF860CA3CF9F5013E3D24D53DE0D914BE2B3A4CBEAE3B4ABCE10741BE13AA29BE3C54933D9577BB3D40F617BEB7E90F3E54502A3E
B47251BD762EE83DC91500BE8AC012BD6313C13D367E3EBE5283B43DC63F44BE"> memref.global "private" constant @__constant_20xf32 : memref<20xf32> = dense<[0.101879634, -0.178835288, -0.0953023583, -0.0698504745, -0.19658649, -0.297641844, -0.223349303, 0.168986112, 9.007710e-02, 0.101534814, -0.0601868108, -0.0958566219, -0.243612081, 0.198881537, -0.293788224, -0.240900397, 0.184188008, 0.210917979, 0.121171109, -0.155078679]> + memref.global "private" constant @__constant_20x10xf32 : memref<20x10xf32> = dense<"0xEDAEFD3D2456463E409C203E725C76BE7D51683ED8D783BEA67B8F3E85C231BE4225383D620472BE6B88963EA8E7983EA0BE27BCCA748B3E7ADA4B3E562D9ABEB62B78BEEB7CF33B967059BD67FB97BE1051E33DCE3D9EBCA3380BBED59EA23DDA6EDD3D2B60943E0D424E3DF5FF363DC197B8BDFBAE8CBDFC7732BE690042BEC03C5A3E63F6103E2DFDF1BD88C91ABE5EBB7F3D32DCDBBD4FA407BEF08F553E055CEE3CF0D34CBD2775633D2F3527BEC9A5D5BDB2DC513E056098BEFBFED83D5C0C17BEFAB77D3E07F9413E5AFCAB3D62069C3E354722BE948E7E3E136825BEC5F011BDB640623D0F16953EFCC7D3BC080B9CBECC1AF13D0DF3963DCE329DBECFCE7FBEBB0DBD3DCB26033E66E5EA3D9884E7BC76249D3E9B2F4ABE8E3CF43B259F883E39E496BE6E9E813D8E26E7BD1656DD3C785D2B3E2E6CF6BDA2C404BE5608BD3B0BF2583E7AAD743E46C71CBD85DBE43BF4464EBD60B90DBE4E7470BEAF01453E937B9B3EF8E6DFBDC82B583DEB5AC8BDC849E83D6543143DD9E68E3E7EFF583D651C673EB6F466BE14197E3D507F46BE658D653C4B210F3E8BD2ADBD379970BE5B5115BEE9C118BDDC57473E37968ABE30212D3EC61183BE79C131BECE1F303E2596A73DA80F16BE3FCF123EDDF5673E601A24BE5BC056BE262F63BDACE23ABE9AEF24BD3D4983BDF2DA0ABED17C18BD1E194FBE451A58BDCF5FFABDCBD28E3E7899D1BDF010903E85B4B03DA3A3F63DD46BA03D05E9193E2CACBF3D8E0C32BE38AA14BED9AC493DF72C993E824129BE46DBAF3D3D993FBE752989BD0F403B3DC71E153D121C05BC9EFC60BCACC7103D7537DE3DAFB6D83CEB4013BD868FF1BCB8624CBE26EDA43B96F24B3E8D148E3EE3E5983E46EB73BE6FD39D3E779721BE26A9693E89B98ABE971808BE808C9ABE53E9E9BDD80AEA3D10EE7D3EBB39213D988D4B3D953B2B3EB17CC43C13D72E3DB1C184BDA6A2823D407141BEBE27AA3A2BE0453EFD30F0BDDB132B3E8B44CB3BEDAF77BE5012703ECC23743E08D786BE714361BE1F534B3ECE3C41BE835CD4BD097461BDB09383BEA24E5BBE826D35BE36B4AB3C2314AABC4D2E963D97023EBD0A3B753E426F4FBD8461313C"> func.func private @Unknown0(%arg0: memref<20xf32>, %arg1: memref<2x20xf32>) -> memref<2x20xf32> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c1 = arith.constant 1 : index + %c20 = arith.constant 20 : index %alloc = memref.alloc() : memref<2x20xf32> - linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%arg1, %arg0 : memref<2x20xf32>, memref<20xf32>) outs(%alloc : memref<2x20xf32>) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %0 = arith.addf %in, %in_0 : f32 - %1 = arith.maxnumf %0, %cst : f32 - linalg.yield %1 : f32 - } - return %alloc : memref<2x20xf32> - } - func.func private @Unknown1(%arg0: memref<20xf32>, %arg1: memref<2x20xf32>) -> memref<2x20xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f32 - %alloc = memref.alloc() : memref<2x20xf32> - linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%arg1, %arg0 : memref<2x20xf32>, memref<20xf32>) outs(%alloc : memref<2x20xf32>) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %0 = arith.addf %in, %in_0 : f32 - %1 = arith.maxnumf %0, %cst : f32 - linalg.yield %1 : f32 + scf.for %arg2 = %c0 to %c2 step %c1 { + scf.for %arg3 = %c0 to %c20 step %c1 { + %subview = memref.subview %arg0[%arg3] [1] [1] : memref<20xf32> to memref> + 
%subview_0 = memref.subview %alloc[%arg2, %arg3] [1, 1] [1, 1] : memref<2x20xf32> to memref> + %subview_1 = memref.subview %arg1[%arg2, %arg3] [1, 1] [1, 1] : memref<2x20xf32> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %in_2: f32, %out: f32): + %0 = arith.addf %in_2, %in : f32 + %1 = arith.maximumf %0, %cst : f32 + linalg.yield %1 : f32 + } + } } return %alloc : memref<2x20xf32> } func.func private @Unknown2(%arg0: memref<10xf32>, %arg1: memref<2x10xf32>) -> memref<2x10xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c1 = arith.constant 1 : index + %c10 = arith.constant 10 : index %alloc = memref.alloc() : memref<2x10xf32> - linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%arg1, %arg0 : memref<2x10xf32>, memref<10xf32>) outs(%alloc : memref<2x10xf32>) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %0 = arith.addf %in, %in_0 : f32 - linalg.yield %0 : f32 + scf.for %arg2 = %c0 to %c2 step %c1 { + scf.for %arg3 = %c0 to %c10 step %c1 { + %subview = memref.subview %arg0[%arg3] [1] [1] : memref<10xf32> to memref> + %subview_0 = memref.subview %alloc[%arg2, %arg3] [1, 1] [1, 1] : memref<2x10xf32> to memref> + %subview_1 = memref.subview %arg1[%arg2, %arg3] [1, 1] [1, 1] : memref<2x10xf32> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %in_2: f32, %out: f32): + %0 = arith.addf %in_2, %in : f32 + linalg.yield %0 : f32 + } + } } return %alloc : memref<2x10xf32> } func.func @forward(%arg0: memref<2x10xf32>) -> memref<2x10xf32> attributes {__placeholder__byre.entry_point} { - %0 = memref.get_global @__constant_20xf32 : memref<20xf32> - %1 = memref.get_global @__constant_20xf32_0 : memref<20xf32> - %2 = memref.get_global @__constant_10xf32 : memref<10xf32> - %3 = memref.get_global @__constant_20x10xf32 : memref<20x10xf32> - %4 = memref.get_global @__constant_20x20xf32 : memref<20x20xf32> - %5 = memref.get_global @__constant_10x20xf32 : memref<10x20xf32> + %0 = memref.get_global @__constant_20x10xf32 : memref<20x10xf32> + %1 = memref.get_global @__constant_20xf32 : memref<20xf32> + %2 = memref.get_global @__constant_20x20xf32 : memref<20x20xf32> + %3 = memref.get_global @__constant_20xf32_0 : memref<20xf32> + %4 = memref.get_global @__constant_10x20xf32 : memref<10x20xf32> + %5 = memref.get_global @__constant_10xf32 : memref<10xf32> %alloc = memref.alloc() : memref<2x20xf32> - byre.compute @MatmulOp_f32f32_f32(%arg0, %5, %alloc) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<2x10xf32>, memref<10x20xf32>, memref<2x20xf32> - %6 = call @Unknown0(%0, %alloc) : (memref<20xf32>, memref<2x20xf32>) -> memref<2x20xf32> + byre.compute @MatmulOp_f32f32_f32(%arg0, %0, %alloc) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<2x10xf32>, memref<20x10xf32>, memref<2x20xf32> + %6 = call @Unknown0(%1, %alloc) : (memref<20xf32>, memref<2x20xf32>) -> memref<2x20xf32> %alloc_0 = memref.alloc() : memref<2x20xf32> - byre.compute @MatmulOp_f32f32_f32(%6, %4, %alloc_0) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} 
: memref<2x20xf32>, memref<20x20xf32>, memref<2x20xf32> - %7 = call @Unknown1(%1, %alloc_0) : (memref<20xf32>, memref<2x20xf32>) -> memref<2x20xf32> + byre.compute @MatmulOp_f32f32_f32(%6, %2, %alloc_0) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<2x20xf32>, memref<20x20xf32>, memref<2x20xf32> + %7 = call @Unknown0(%3, %alloc_0) : (memref<20xf32>, memref<2x20xf32>) -> memref<2x20xf32> %alloc_1 = memref.alloc() : memref<2x10xf32> - byre.compute @MatmulOp_f32f32_f32(%7, %3, %alloc_1) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<2x20xf32>, memref<20x10xf32>, memref<2x10xf32> - %8 = call @Unknown2(%2, %alloc_1) : (memref<10xf32>, memref<2x10xf32>) -> memref<2x10xf32> + byre.compute @MatmulOp_f32f32_f32(%7, %4, %alloc_1) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<2x20xf32>, memref<10x20xf32>, memref<2x10xf32> + %8 = call @Unknown2(%5, %alloc_1) : (memref<10xf32>, memref<2x10xf32>) -> memref<2x10xf32> return %8 : memref<2x10xf32> } } \ No newline at end of file diff --git a/compiler/test/E2E/MLPInference/5_alternative_scf_opt.mlir b/compiler/test/E2E/MLPInference/5_alternative_scf_opt.mlir index 08dda3cc8..6e8595988 100644 --- a/compiler/test/E2E/MLPInference/5_alternative_scf_opt.mlir +++ b/compiler/test/E2E/MLPInference/5_alternative_scf_opt.mlir @@ -2,62 +2,72 @@ // CHECK-LABEL: func.func @forward -#map = affine_map<(d0, d1) -> (d0, d1)> -#map1 = affine_map<(d0, d1) -> (d1)> +#map = affine_map<() -> ()> module attributes {torch.debug_module_name = "GraphModule"} { - memref.global "private" constant @__constant_10x20xf32 : memref<10x20xf32> = dense<"0xEDAEFD3D6B88963E1051E33DFC7732BE055CEE3C07F9413E080B9CBE9B2F4ABE5608BD3BF8E6DFBD507F46BEC61183BEACE23ABEF010903E824129BEAFB6D83C779721BE953B2B3E8B44CB3BB09383BE2456463EA8E7983ECE3D9EBC690042BEF0D34CBD5AFCAB3DCC1AF13D8E3CF43B0BF2583EC82B583D658D653C79C131BE9AEF24BD85B4B03D46DBAF3DEB4013BD26A9693EB17CC43CEDAF77BEA24E5BBE409C203EA0BE27BCA3380BBEC03C5A3E2775633D62069C3E0DF3963D259F883E7AAD743EEB5AC8BD4B210F3ECE1F303E3D4983BDA3A3F63D3D993FBE868FF1BC89B98ABE13D72E3D5012703E826D35BE725C76BECA748B3ED59EA23D63F6103E2F3527BE354722BECE329DBE39E496BE46C71CBDC849E83D8BD2ADBD2596A73DF2DA0ABED46BA03D752989BDB8624CBE971808BEB1C184BDCC23743E36B4AB3C7D51683E7ADA4B3EDA6EDD3D2DFDF1BDC9A5D5BD948E7E3ECFCE7FBE6E9E813D85DBE43B6543143D379970BEA80F16BED17C18BD05E9193E0F403B3D26EDA43B808C9ABEA6A2823D08D786BE2314AABCD8D783BE562D9ABE2B60943E88C91ABEB2DC513E136825BEBB0DBD3D8E26E7BDF4464EBDD9E68E3E5B5115BE3FCF123E1E194FBE2CACBF3DC71E153D96F24B3E53E9E9BD407141BE714361BE4D2E963DA67B8F3EB62B78BE0D424E3D5EBB7F3D056098BEC5F011BDCB26033E1656DD3C60B90DBE7EFF583DE9C118BDDDF5673E451A58BD8E0C32BE121C05BC8D148E3ED80AEA3DBE27AA3A1F534B3E97023EBD85C231BEEB7CF33BF5FF363D32DCDBBDFBFED83DB640623D66E5EA3D785D2B3E4E7470BE651C673EDC57473E601A24BECF5FFABD38AA14BE9EFC60BCE3E5983E10EE7D3E2BE0453ECE3C41BE0A3B753E4225383D967059BDC197B8BD4FA407BE5C0C17BE0F16953E9884E7BC2E6CF6BDAF01453EB6F466BE37968ABE5BC056BECBD28E3ED9AC493DACC7103D46EB73BEBB39213DFD30F0BD835CD4BD426F4FBD620472BE67FB97BEFBAE8CBDF08F553EFAB77D3EFCC7D3BC76249D3EA2C404BE937B9B3E14197E3D30212D3E262F63BD7899D1BDF72C993E7537DE3D6FD39D3E988D4B3DDB132B3E097461BD8461313C"> - memref.global "private" constant @__constant_20x20xf32 : memref<20x20xf32> = 
dense<"0x32A148BE93869BBDDF0E313EADAE1EBD5C52443E3506E7BBF2CFB3BD4CF5A13DBC209B3D697E213D4A14543DAB303B3EB6C1FA3B3524DBBCE1C81ABE9DDF51BEBF11173EFFDA4C3E6E8A2DBEF9F5013E6D99D1BDBEED34BDA2CFA0BCBFCE46BE85B19BBC4D7659BE398863BEC7A3DD3D590C1ABE4A942DBE9EB929BE6A0D20BE941C90BD097A76BC0640443EAE5B30BE723C573EC0EE403E0F2024BE3D24D53D3C565A3E62E5843DA45CAF3C41EC6DBBE2019E3D1D3A013E42C74CBE4E3CE63D734E60BE530B01BECD1844BEEC38BE3C19E862BEFC6F0DBEE58D4F3D03A7AA3D1DCF543C756D503E81D23BBEE0D914BE614360BCC40D4F3E5A313E3E34D6FDBD9C3519BE72500C3D1B39AABD9B2983BD110010BDAFAD27BCEAFFB5BD4A82103D8E7AD13CD2693FBE3E6125BE03FD743D3B3825BEB7FCD53DCCDA58BE2B3A4CBE1E9BDF3BFF38F73CC33C35BD40D157BEDD478A3CD1E6373E436BEEBD04BE2E3EBD5F2B3D1D7813BE0EF101BE0369A1BD8FF8D93D829D3A3E85CB2EBC6A2B023EEF8C273EB6F3C2BD27F955BDAE3B4ABC265A9D3D45B35D3E112654BB8BEEB6BDBD70993D7DDE62BEDB8017BEE2AC4FBE9F8E213E592C1B3E809B3DBE7DF30B3ED463133D672DEB3DEB5A44BEFFA2B03C894A33BE8A2935BE93A6223EE10741BEBD5F18BEDE0A0EBEEDE673BD23F447BEB37F3BBE052B503E5683343E13D6283EC693A03D751BB5BDF94F243E5014C23C003410BEEBBD613ECADD55BE56C2C3BD8922313DA41CF7BD0CF8D5BD13AA29BE8FE25B3ECC6652BE20FF4CBED78823BD32E5003E392A323DE1BAA13DFB3A9BBD9FF356BD3C4FE13DFDFA51BDB60E2C3EBEE61D3EC5EFDA3DE721A0BD13E8F3BC0C34F5BD3E911D3EFD7F38BD3C54933DC201E4BC35B79BBD09665BBE2F2FC8BDBEFA293E668CEDBD0F397ABDC1675D3EC0F1E03D164A5FBDAA43353C19AC2FBE1A2E90BC3881E6BDC3EA5EBE550D21BEB6B3023E1E97B0BD500C6DBD9577BB3D085E783C9044093D916844BE162F39BE955128BEB6F210BDA4C90BBDCD44643E511B3DBE8941593E30FEC73CBEC5D73D8FF2A2BDCB4F4B3E4F6DC2BB2249463E35888ABC80299C3D70BAFEBD40F617BEA941FFBDF13E123E4C305C3E5C838CBDF44A6EBD2A505EBEF0BFA83A8BC9E03DD0DA303D64522C3E58A96FBD7211593EC3BEA6BC14EF8ABDFE5949BE43880FBE46CE16BD097B633D7775193EB7E90F3ED2D3953DC271633D6BF8053DA6DFECBD5728953DB72E1DBD2B345DBD7053E83CE305B1BDF2F3013E0DE876BD8F1098BD7232E23D4CDE0C3DAF43A93D1A34DE3DFB1247BD569B3F3E09619BBD54502A3E6BA636BC17C3273EBE032E3E34F4173C54B10A3E9452EB3D82FDD33D005014BE578D51BDF3E116BE4EDE0F3E47BF58BEC35E793D7195303ED3A822BEA331963A9CF9403E1FD32DBD4EFCBB3DB47251BDD3A570BD3C32473E614DAD3A6978643EC49A543E02AA16BEDFDE243D7ADBE9BDE1224FBE6C6826BE5EC4083E7B00783D9E975C3EEF3A1FBDC7AD1DBDCB200EBDE55E993D734D97BC4C6E543E762EE83DBF6DFBBD5F161E3CB941F3BDF40BB2BD4C2E6ABD3B33963D7B3507BE3344553ED50454BD689A233E263939BE1D60D1BB4E3959BE0BAF203C719EC8BD48F3273EFB4B103D7943B0BDA66038BEC91500BE50263FBE52262FBE207BA93D661E823DEC4B56BE58CF3A3E913828BED66E42BE57F2B2BDFFB9503EE2FD4A3E2F23A43C542B493EE2CA34BE2966513EBB4B1FBD391127BEC942A7BD196DD9BD8AC012BD6514C1BC1F9E173EA59780BD511514BD9AB8F4BD339719BEB72639BE38C9D73CB5062F3E895743BD413055BE423D12BC402404BEB4A8463E4EA9F0BD57EA383C2432453C9611263ED58B283E6313C13DC2C95FBEA30B07BE15F2E4BD611C4BBE4FEF053CC73E013E9DF7B5BDED4160BE917CFDBD7FC9523E0A8B643EC300013E966686BDD0DC1ABE05B535BECD8A573E475E283DCECB5EBDFDE8B13D367E3EBEFCD7B43D3C7D24BECE26BD3D48BF20BEF7F0D7BB421F193E85BA77BDCAE272BAB0FD9CBD487E3FBEBBF9603D8B5E24BE27A9C33D95A2063E6DD0BC3DA359173EDA01A5BD505B97BDDEE6713D5283B43DE477DBBC674E2CBED5E3593DBCCB2FBED7D1303EA9753FBE017FC43DA94A9B3DE74A0E3E2FFC273EA596B2BDD8554A3C703E19BE04CF7BBD9155863D65FA1EBE144C573E709798BBF860CA3CC63F44BE"> - memref.global "private" constant @__constant_20x10xf32 : memref<20x10xf32> = 
dense<"0xD60ED23D6677593EEDD4253D579F523E952583BD454F62BE2C2AB53D827CF4BD7F4D41BD15896C3DDEFF3B3D5270AF3D461635BC07D5B8BDB82761BE33120CBE692C35BCDB70183EED6F4D3E271EAD3DFBC1653B77D0473E7920B43C2135C43C1780C2BBD145D43C15B1203DAAD3623C9A1808BECF144D3E7AFAC43D933CF8BDFE637BBD21BE53BEADCA233E250529BD4576FABD2B7B84BCCCB48FBC84B3D6BDAB54263EC31061BEF81913BE3E001C3E4D7A0E3EE4CE4EBE2119CFBDC6B6123EB3CA01BEB506BF3DDA0774BD37FD1E3D1EB44BBE38E409BE994A1FBE74FFDB3DBB05A93D29209A3D756B01BDEAA162BD602A03BE522F3FBE9C1DDC3D486F1EBEB99D21BEE05813BE074943BE4233E33C32BBFA3D81F31ABE747C53BEC1814A3EA77B363E551E113E65E5673D0EDAC4BDC9732E3DD45C3C3E1BCB113E9DD61DBDC754B93DC9CB88BCB5D84DBEF14E443ED54E1EBE659C343E349F62BEF5BBD73D1042FDBDBFDD4EBD306D813DE33C16BE39B00EBE946CD7BDB97CC9BD08A411BDC933103EDE725FBEF3C1553D830E3B3CBFD7253ECFBD0D3EDC6CD83DA6EA563D9238C53C1405B2BD4948D83C0A72B03DEC8AD93DD04F3A3E96216E3C0351123E034807BE0929E7BDD3E3453DCD388B3D68C034BCE7650E3E41C70D3E392C32BEEA885CBC19FFD9BD8B68F83C6FE3B5BCC302633E736E193ED8D1503EBEB64B3EE0263C3EAC4E223DB90511BE36A5443E86AA713C3698A3BCB129A93D517218BEA7BB51BECA19393E8948043E5DE08BBDA1F012BED3483E3EB3F664BEEB093DBEEF2FB7BC3243F6BC2EFA283EB41EF5BDF16A573D19E6E83DDCF4F23DA4D8C3BD4877DFBC1B13303B60330D3ECA5569BD97A090BD3A5E9C3D3565B63DE54F6BBDAC1B29BDDEB4CD3D6C5006BE61B08FBC5AD9DA3B80340EBE933AD83D6334E93CE5F18EBDCC3D73BD128D2ABD46CA34BDDDA5BABDE0C404BD533F223D3072B4BD7D7C413EB831DABD3359BBBB0A69993D6024AD3D0D0EE9BDBD5A80BC7FA6E83D400BFBBD616438BE34CBC73D4769D73D4E295EBEECBE843C838E9CBDA171FA3DDCED3EBE9FEDA6BDAD78A8BDD19702BD18B3043E39CF873C46E602BCD0CBC33D"> memref.global "private" constant @__constant_10xf32 : memref<10xf32> = dense<[0.0670170113, 0.0825609341, -0.125343189, -0.0073415176, -0.100303039, -0.214000896, 0.114002995, 0.21737574, 0.166609675, -0.119800359]> + memref.global "private" constant @__constant_10x20xf32 : memref<10x20xf32> = dense<"0xD60ED23DDEFF3B3DFBC1653B7AFAC43DAB54263EDA0774BD602A03BE747C53BEC754B93D306D813DBFD7253E96216E3CEA885CBCB90511BEA1F012BEDCF4F23DAC1B29BD128D2ABD6024AD3D838E9CBD6677593E5270AF3D77D0473E933CF8BDC31061BE37FD1E3D522F3FBEC1814A3EC9CB88BCE33C16BECFBD0D3E0351123E19FFD9BD36A5443ED3483E3EA4D8C3BDDEB4CD3D46CA34BD0D0EE9BDA171FA3DEDD4253D461635BC7920B43CFE637BBDF81913BE1EB44BBE9C1DDC3DA77B363EB5D84DBE39B00EBEDC6CD83D034807BE8B68F83C86AA713CB3F664BE4877DFBC6C5006BEDDA5BABDBD5A80BCDCED3EBE579F523E07D5B8BD2135C43C21BE53BE3E001C3E38E409BE486F1EBE551E113EF14E443E946CD7BDA6EA563D0929E7BD6FE3B5BC3698A3BCEB093DBE1B13303B61B08FBCE0C404BD7FA6E83D9FEDA6BD952583BDB82761BE1780C2BBADCA233E4D7A0E3E994A1FBEB99D21BE65E5673DD54E1EBEB97CC9BD9238C53CD3E3453DC302633EB129A93DEF2FB7BC60330D3E5AD9DA3B533F223D400BFBBDAD78A8BD454F62BE33120CBED145D43C250529BDE4CE4EBE74FFDB3DE05813BE0EDAC4BD659C343E08A411BD1405B2BDCD388B3D736E193E517218BE3243F6BCCA5569BD80340EBE3072B4BD616438BED19702BD2C2AB53D692C35BC15B1203D4576FABD2119CFBDBB05A93D074943BEC9732E3D349F62BEC933103E4948D83C68C034BCD8D1503EA7BB51BE2EFA283E97A090BD933AD83D7D7C413E34CBC73D18B3043E827CF4BDDB70183EAAD3623C2B7B84BCC6B6123E29209A3D4233E33CD45C3C3EF5BBD73DDE725FBE0A72B03DE7650E3EBEB64B3ECA19393EB41EF5BD3A5E9C3D6334E93CB831DABD4769D73D39CF873C7F4D41BDED6F4D3E9A1808BECCB48FBCB3CA01BE756B01BD32BBFA3D1BCB113E1042FDBDF3C1553DEC8AD93D41C70D3EE0263C3E8948043EF16A573D3565B63DE5F18EBD3359BBBB4E295EBE46E602BC15896C3D271EAD3DCF144D3E84B3D6BDB506BF3DEAA162BD81F31ABE9DD61DBDBFDD4EBD830E3B3CD04F3A3E392C32BEAC4E223D5DE08BBD19E6E83DE54F6BBDCC3D73BD0A69993DECBE843CD0CBC33D"> memref.global "private" constant 
@__constant_20xf32_0 : memref<20xf32> = dense<[0.124238588, -0.0375917405, -0.178324029, 2.1018261E-4, -0.0708629936, 0.179958493, 0.201986402, -0.0302014686, -0.0842267424, 0.0796111747, 0.0201944318, -0.183529228, -0.133614406, -0.0192934573, 0.193412527, 0.219010666, -0.0464102961, 0.00334274326, -0.0029087835, 0.0903228372]> + memref.global "private" constant @__constant_20x20xf32 : memref<20x20xf32> = dense<"0x32A148BE6D99D1BD3C565A3E614360BC1E9BDF3B265A9D3DBD5F18BE8FE25B3EC201E4BC085E783CA941FFBDD2D3953D6BA636BCD3A570BDBF6DFBBD50263FBE6514C1BCC2C95FBEFCD7B43DE477DBBC93869BBDBEED34BD62E5843DC40D4F3EFF38F73C45B35D3EDE0A0EBECC6652BE35B79BBD9044093DF13E123EC271633D17C3273E3C32473E5F161E3C52262FBE1F9E173EA30B07BE3C7D24BE674E2CBEDF0E313EA2CFA0BCA45CAF3C5A313E3EC33C35BD112654BBEDE673BD20FF4CBE09665BBE916844BE4C305C3E6BF8053DBE032E3E614DAD3AB941F3BD207BA93DA59780BD15F2E4BDCE26BD3DD5E3593DADAE1EBDBFCE46BE41EC6DBB34D6FDBD40D157BE8BEEB6BD23F447BED78823BD2F2FC8BD162F39BE5C838CBDA6DFECBD34F4173C6978643EF40BB2BD661E823D511514BD611C4BBE48BF20BEBCCB2FBE5C52443E85B19BBCE2019E3D9C3519BEDD478A3CBD70993DB37F3BBE32E5003EBEFA293E955128BEF44A6EBD5728953D54B10A3EC49A543E4C2E6ABDEC4B56BE9AB8F4BD4FEF053CF7F0D7BBD7D1303E3506E7BB4D7659BE1D3A013E72500C3DD1E6373E7DDE62BE052B503E392A323D668CEDBDB6F210BD2A505EBEB72E1DBD9452EB3D02AA16BE3B33963D58CF3A3E339719BEC73E013E421F193EA9753FBEF2CFB3BD398863BE42C74CBE1B39AABD436BEEBDDB8017BE5683343EE1BAA13D0F397ABDA4C90BBDF0BFA83A2B345DBD82FDD33DDFDE243D7B3507BE913828BEB72639BE9DF7B5BD85BA77BD017FC43D4CF5A13DC7A3DD3D4E3CE63D9B2983BD04BE2E3EE2AC4FBE13D6283EFB3A9BBDC1675D3ECD44643E8BC9E03D7053E83C005014BE7ADBE9BD3344553ED66E42BE38C9D73CED4160BECAE272BAA94A9B3DBC209B3D590C1ABE734E60BE110010BDBD5F2B3D9F8E213EC693A03D9FF356BDC0F1E03D511B3DBED0DA303DE305B1BD578D51BDE1224FBED50454BD57F2B2BDB5062F3E917CFDBDB0FD9CBDE74A0E3E697E213D4A942DBE530B01BEAFAD27BC1D7813BE592C1B3E751BB5BD3C4FE13D164A5FBD8941593E64522C3EF2F3013EF3E116BE6C6826BE689A233EFFB9503E895743BD7FC9523E487E3FBE2FFC273E4A14543D9EB929BECD1844BEEAFFB5BD0EF101BE809B3DBEF94F243EFDFA51BDAA43353C30FEC73C58A96FBD0DE876BD4EDE0F3E5EC4083E263939BEE2FD4A3E413055BE0A8B643EBBF9603DA596B2BDAB303B3E6A0D20BEEC38BE3C4A82103D0369A1BD7DF30B3E5014C23CB60E2C3E19AC2FBEBEC5D73D7211593E8F1098BD47BF58BE7B00783D1D60D1BB2F23A43C423D12BCC300013E8B5E24BED8554A3CB6C1FA3B941C90BD19E862BE8E7AD13C8FF8D93DD463133D003410BEBEE61D3E1A2E90BC8FF2A2BDC3BEA6BC7232E23DC35E793D9E975C3E4E3959BE542B493E402404BE966686BD27A9C33D703E19BE3524DBBC097A76BCFC6F0DBED2693FBE829D3A3E672DEB3DEBBD613EC5EFDA3D3881E6BDCB4F4B3E14EF8ABD4CDE0C3D7195303EEF3A1FBD0BAF203CE2CA34BEB4A8463ED0DC1ABE95A2063E04CF7BBDE1C81ABE0640443EE58D4F3D3E6125BE85CB2EBCEB5A44BECADD55BEE721A0BDC3EA5EBE4F6DC2BBFE5949BEAF43A93DD3A822BEC7AD1DBD719EC8BD2966513E4EA9F0BD05B535BE6DD0BC3D9155863D9DDF51BEAE5B30BE03A7AA3D03FD743D6A2B023EFFA2B03C56C2C3BD13E8F3BC550D21BE2249463E43880FBE1A34DE3DA331963ACB200EBD48F3273EBB4B1FBD57EA383CCD8A573EA359173E65FA1EBEBF11173E723C573E1DCF543C3B3825BEEF8C273E894A33BE8922313D0C34F5BDB6B3023E35888ABC46CE16BDFB1247BD9CF9403EE55E993DFB4B103D391127BE2432453C475E283DDA01A5BD144C573EFFDA4C3EC0EE403E756D503EB7FCD53DB6F3C2BD8A2935BEA41CF7BD3E911D3E1E97B0BD80299C3D097B633D569B3F3E1FD32DBD734D97BC7943B0BDC942A7BD9611263ECECB5EBD505B97BD709798BB6E8A2DBE0F2024BE81D23BBECCDA58BE27F955BD93A6223E0CF8D5BDFD7F38BD500C6DBD70BAFEBD7775193E09619BBD4EFCBB3D4C6E543EA66038BE196DD9BDD58B283EFDE8B13DDEE6713DF860CA3CF9F5013E3D24D53DE0D914BE2B3A4CBEAE3B4ABCE10741BE13AA29BE3C54933D9577BB3D40F617BEB7E90F3E54502A3E
B47251BD762EE83DC91500BE8AC012BD6313C13D367E3EBE5283B43DC63F44BE"> memref.global "private" constant @__constant_20xf32 : memref<20xf32> = dense<[0.101879634, -0.178835288, -0.0953023583, -0.0698504745, -0.19658649, -0.297641844, -0.223349303, 0.168986112, 9.007710e-02, 0.101534814, -0.0601868108, -0.0958566219, -0.243612081, 0.198881537, -0.293788224, -0.240900397, 0.184188008, 0.210917979, 0.121171109, -0.155078679]> + memref.global "private" constant @__constant_20x10xf32 : memref<20x10xf32> = dense<"0xEDAEFD3D2456463E409C203E725C76BE7D51683ED8D783BEA67B8F3E85C231BE4225383D620472BE6B88963EA8E7983EA0BE27BCCA748B3E7ADA4B3E562D9ABEB62B78BEEB7CF33B967059BD67FB97BE1051E33DCE3D9EBCA3380BBED59EA23DDA6EDD3D2B60943E0D424E3DF5FF363DC197B8BDFBAE8CBDFC7732BE690042BEC03C5A3E63F6103E2DFDF1BD88C91ABE5EBB7F3D32DCDBBD4FA407BEF08F553E055CEE3CF0D34CBD2775633D2F3527BEC9A5D5BDB2DC513E056098BEFBFED83D5C0C17BEFAB77D3E07F9413E5AFCAB3D62069C3E354722BE948E7E3E136825BEC5F011BDB640623D0F16953EFCC7D3BC080B9CBECC1AF13D0DF3963DCE329DBECFCE7FBEBB0DBD3DCB26033E66E5EA3D9884E7BC76249D3E9B2F4ABE8E3CF43B259F883E39E496BE6E9E813D8E26E7BD1656DD3C785D2B3E2E6CF6BDA2C404BE5608BD3B0BF2583E7AAD743E46C71CBD85DBE43BF4464EBD60B90DBE4E7470BEAF01453E937B9B3EF8E6DFBDC82B583DEB5AC8BDC849E83D6543143DD9E68E3E7EFF583D651C673EB6F466BE14197E3D507F46BE658D653C4B210F3E8BD2ADBD379970BE5B5115BEE9C118BDDC57473E37968ABE30212D3EC61183BE79C131BECE1F303E2596A73DA80F16BE3FCF123EDDF5673E601A24BE5BC056BE262F63BDACE23ABE9AEF24BD3D4983BDF2DA0ABED17C18BD1E194FBE451A58BDCF5FFABDCBD28E3E7899D1BDF010903E85B4B03DA3A3F63DD46BA03D05E9193E2CACBF3D8E0C32BE38AA14BED9AC493DF72C993E824129BE46DBAF3D3D993FBE752989BD0F403B3DC71E153D121C05BC9EFC60BCACC7103D7537DE3DAFB6D83CEB4013BD868FF1BCB8624CBE26EDA43B96F24B3E8D148E3EE3E5983E46EB73BE6FD39D3E779721BE26A9693E89B98ABE971808BE808C9ABE53E9E9BDD80AEA3D10EE7D3EBB39213D988D4B3D953B2B3EB17CC43C13D72E3DB1C184BDA6A2823D407141BEBE27AA3A2BE0453EFD30F0BDDB132B3E8B44CB3BEDAF77BE5012703ECC23743E08D786BE714361BE1F534B3ECE3C41BE835CD4BD097461BDB09383BEA24E5BBE826D35BE36B4AB3C2314AABC4D2E963D97023EBD0A3B753E426F4FBD8461313C"> func.func private @Unknown0(%arg0: memref<20xf32>, %arg1: memref<2x20xf32>) -> memref<2x20xf32> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f32 + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c1 = arith.constant 1 : index + %c20 = arith.constant 20 : index %alloc = memref.alloc() : memref<2x20xf32> - linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%arg1, %arg0 : memref<2x20xf32>, memref<20xf32>) outs(%alloc : memref<2x20xf32>) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %0 = arith.addf %in, %in_0 : f32 - %1 = arith.maxnumf %0, %cst : f32 - linalg.yield %1 : f32 - } - return %alloc : memref<2x20xf32> - } - func.func private @Unknown1(%arg0: memref<20xf32>, %arg1: memref<2x20xf32>) -> memref<2x20xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f32 - %alloc = memref.alloc() : memref<2x20xf32> - linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%arg1, %arg0 : memref<2x20xf32>, memref<20xf32>) outs(%alloc : memref<2x20xf32>) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %0 = arith.addf %in, %in_0 : f32 - %1 = arith.maxnumf %0, %cst : f32 - linalg.yield %1 : f32 + scf.for %arg2 = %c0 to %c2 step %c1 { + scf.for %arg3 = %c0 to %c20 step %c1 { + %subview = memref.subview %arg0[%arg3] [1] [1] : memref<20xf32> to memref> + 
%subview_0 = memref.subview %alloc[%arg2, %arg3] [1, 1] [1, 1] : memref<2x20xf32> to memref> + %subview_1 = memref.subview %arg1[%arg2, %arg3] [1, 1] [1, 1] : memref<2x20xf32> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %in_2: f32, %out: f32): + %0 = arith.addf %in_2, %in : f32 + %1 = arith.maximumf %0, %cst : f32 + linalg.yield %1 : f32 + } + } } return %alloc : memref<2x20xf32> } func.func private @Unknown2(%arg0: memref<10xf32>, %arg1: memref<2x10xf32>) -> memref<2x10xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c1 = arith.constant 1 : index + %c10 = arith.constant 10 : index %alloc = memref.alloc() : memref<2x10xf32> - linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%arg1, %arg0 : memref<2x10xf32>, memref<10xf32>) outs(%alloc : memref<2x10xf32>) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %0 = arith.addf %in, %in_0 : f32 - linalg.yield %0 : f32 + scf.for %arg2 = %c0 to %c2 step %c1 { + scf.for %arg3 = %c0 to %c10 step %c1 { + %subview = memref.subview %arg0[%arg3] [1] [1] : memref<10xf32> to memref> + %subview_0 = memref.subview %alloc[%arg2, %arg3] [1, 1] [1, 1] : memref<2x10xf32> to memref> + %subview_1 = memref.subview %arg1[%arg2, %arg3] [1, 1] [1, 1] : memref<2x10xf32> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %in_2: f32, %out: f32): + %0 = arith.addf %in_2, %in : f32 + linalg.yield %0 : f32 + } + } } return %alloc : memref<2x10xf32> } func.func @forward(%arg0: memref<2x10xf32>) -> memref<2x10xf32> attributes {__placeholder__byre.entry_point} { - %0 = memref.get_global @__constant_20xf32 : memref<20xf32> - %1 = memref.get_global @__constant_20xf32_0 : memref<20xf32> - %2 = memref.get_global @__constant_10xf32 : memref<10xf32> - %3 = memref.get_global @__constant_20x10xf32 : memref<20x10xf32> - %4 = memref.get_global @__constant_20x20xf32 : memref<20x20xf32> - %5 = memref.get_global @__constant_10x20xf32 : memref<10x20xf32> + %0 = memref.get_global @__constant_20x10xf32 : memref<20x10xf32> + %1 = memref.get_global @__constant_20xf32 : memref<20xf32> + %2 = memref.get_global @__constant_20x20xf32 : memref<20x20xf32> + %3 = memref.get_global @__constant_20xf32_0 : memref<20xf32> + %4 = memref.get_global @__constant_10x20xf32 : memref<10x20xf32> + %5 = memref.get_global @__constant_10xf32 : memref<10xf32> %alloc = memref.alloc() : memref<2x20xf32> - byre.compute @MatmulOp_f32f32_f32(%arg0, %5, %alloc) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<2x10xf32>, memref<10x20xf32>, memref<2x20xf32> - %6 = call @Unknown0(%0, %alloc) : (memref<20xf32>, memref<2x20xf32>) -> memref<2x20xf32> + byre.compute @MatmulOp_f32f32_f32(%arg0, %0, %alloc) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<2x10xf32>, memref<20x10xf32>, memref<2x20xf32> + %6 = call @Unknown0(%1, %alloc) : (memref<20xf32>, memref<2x20xf32>) -> memref<2x20xf32> %alloc_0 = memref.alloc() : memref<2x20xf32> - byre.compute @MatmulOp_f32f32_f32(%6, %4, %alloc_0) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} 
: memref<2x20xf32>, memref<20x20xf32>, memref<2x20xf32> - %7 = call @Unknown1(%1, %alloc_0) : (memref<20xf32>, memref<2x20xf32>) -> memref<2x20xf32> + byre.compute @MatmulOp_f32f32_f32(%6, %2, %alloc_0) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<2x20xf32>, memref<20x20xf32>, memref<2x20xf32> + %7 = call @Unknown0(%3, %alloc_0) : (memref<20xf32>, memref<2x20xf32>) -> memref<2x20xf32> %alloc_1 = memref.alloc() : memref<2x10xf32> - byre.compute @MatmulOp_f32f32_f32(%7, %3, %alloc_1) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<2x20xf32>, memref<20x10xf32>, memref<2x10xf32> - %8 = call @Unknown2(%2, %alloc_1) : (memref<10xf32>, memref<2x10xf32>) -> memref<2x10xf32> + byre.compute @MatmulOp_f32f32_f32(%7, %4, %alloc_1) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<2x20xf32>, memref<10x20xf32>, memref<2x10xf32> + %8 = call @Unknown2(%5, %alloc_1) : (memref<10xf32>, memref<2x10xf32>) -> memref<2x10xf32> return %8 : memref<2x10xf32> } } \ No newline at end of file diff --git a/compiler/test/E2E/MLPInference/6_gpu_opt.mlir b/compiler/test/E2E/MLPInference/6_gpu_opt.mlir index a3052a076..1cfadee5a 100644 --- a/compiler/test/E2E/MLPInference/6_gpu_opt.mlir +++ b/compiler/test/E2E/MLPInference/6_gpu_opt.mlir @@ -3,107 +3,62 @@ // CHECK-LABEL: func.func @forward module attributes {torch.debug_module_name = "GraphModule"} { - memref.global "private" constant @__constant_10x20xf32 : memref<10x20xf32> = dense<"0xEDAEFD3D6B88963E1051E33DFC7732BE055CEE3C07F9413E080B9CBE9B2F4ABE5608BD3BF8E6DFBD507F46BEC61183BEACE23ABEF010903E824129BEAFB6D83C779721BE953B2B3E8B44CB3BB09383BE2456463EA8E7983ECE3D9EBC690042BEF0D34CBD5AFCAB3DCC1AF13D8E3CF43B0BF2583EC82B583D658D653C79C131BE9AEF24BD85B4B03D46DBAF3DEB4013BD26A9693EB17CC43CEDAF77BEA24E5BBE409C203EA0BE27BCA3380BBEC03C5A3E2775633D62069C3E0DF3963D259F883E7AAD743EEB5AC8BD4B210F3ECE1F303E3D4983BDA3A3F63D3D993FBE868FF1BC89B98ABE13D72E3D5012703E826D35BE725C76BECA748B3ED59EA23D63F6103E2F3527BE354722BECE329DBE39E496BE46C71CBDC849E83D8BD2ADBD2596A73DF2DA0ABED46BA03D752989BDB8624CBE971808BEB1C184BDCC23743E36B4AB3C7D51683E7ADA4B3EDA6EDD3D2DFDF1BDC9A5D5BD948E7E3ECFCE7FBE6E9E813D85DBE43B6543143D379970BEA80F16BED17C18BD05E9193E0F403B3D26EDA43B808C9ABEA6A2823D08D786BE2314AABCD8D783BE562D9ABE2B60943E88C91ABEB2DC513E136825BEBB0DBD3D8E26E7BDF4464EBDD9E68E3E5B5115BE3FCF123E1E194FBE2CACBF3DC71E153D96F24B3E53E9E9BD407141BE714361BE4D2E963DA67B8F3EB62B78BE0D424E3D5EBB7F3D056098BEC5F011BDCB26033E1656DD3C60B90DBE7EFF583DE9C118BDDDF5673E451A58BD8E0C32BE121C05BC8D148E3ED80AEA3DBE27AA3A1F534B3E97023EBD85C231BEEB7CF33BF5FF363D32DCDBBDFBFED83DB640623D66E5EA3D785D2B3E4E7470BE651C673EDC57473E601A24BECF5FFABD38AA14BE9EFC60BCE3E5983E10EE7D3E2BE0453ECE3C41BE0A3B753E4225383D967059BDC197B8BD4FA407BE5C0C17BE0F16953E9884E7BC2E6CF6BDAF01453EB6F466BE37968ABE5BC056BECBD28E3ED9AC493DACC7103D46EB73BEBB39213DFD30F0BD835CD4BD426F4FBD620472BE67FB97BEFBAE8CBDF08F553EFAB77D3EFCC7D3BC76249D3EA2C404BE937B9B3E14197E3D30212D3E262F63BD7899D1BDF72C993E7537DE3D6FD39D3E988D4B3DDB132B3E097461BD8461313C"> - memref.global "private" constant @__constant_20x20xf32 : memref<20x20xf32> = 
dense<"0x32A148BE93869BBDDF0E313EADAE1EBD5C52443E3506E7BBF2CFB3BD4CF5A13DBC209B3D697E213D4A14543DAB303B3EB6C1FA3B3524DBBCE1C81ABE9DDF51BEBF11173EFFDA4C3E6E8A2DBEF9F5013E6D99D1BDBEED34BDA2CFA0BCBFCE46BE85B19BBC4D7659BE398863BEC7A3DD3D590C1ABE4A942DBE9EB929BE6A0D20BE941C90BD097A76BC0640443EAE5B30BE723C573EC0EE403E0F2024BE3D24D53D3C565A3E62E5843DA45CAF3C41EC6DBBE2019E3D1D3A013E42C74CBE4E3CE63D734E60BE530B01BECD1844BEEC38BE3C19E862BEFC6F0DBEE58D4F3D03A7AA3D1DCF543C756D503E81D23BBEE0D914BE614360BCC40D4F3E5A313E3E34D6FDBD9C3519BE72500C3D1B39AABD9B2983BD110010BDAFAD27BCEAFFB5BD4A82103D8E7AD13CD2693FBE3E6125BE03FD743D3B3825BEB7FCD53DCCDA58BE2B3A4CBE1E9BDF3BFF38F73CC33C35BD40D157BEDD478A3CD1E6373E436BEEBD04BE2E3EBD5F2B3D1D7813BE0EF101BE0369A1BD8FF8D93D829D3A3E85CB2EBC6A2B023EEF8C273EB6F3C2BD27F955BDAE3B4ABC265A9D3D45B35D3E112654BB8BEEB6BDBD70993D7DDE62BEDB8017BEE2AC4FBE9F8E213E592C1B3E809B3DBE7DF30B3ED463133D672DEB3DEB5A44BEFFA2B03C894A33BE8A2935BE93A6223EE10741BEBD5F18BEDE0A0EBEEDE673BD23F447BEB37F3BBE052B503E5683343E13D6283EC693A03D751BB5BDF94F243E5014C23C003410BEEBBD613ECADD55BE56C2C3BD8922313DA41CF7BD0CF8D5BD13AA29BE8FE25B3ECC6652BE20FF4CBED78823BD32E5003E392A323DE1BAA13DFB3A9BBD9FF356BD3C4FE13DFDFA51BDB60E2C3EBEE61D3EC5EFDA3DE721A0BD13E8F3BC0C34F5BD3E911D3EFD7F38BD3C54933DC201E4BC35B79BBD09665BBE2F2FC8BDBEFA293E668CEDBD0F397ABDC1675D3EC0F1E03D164A5FBDAA43353C19AC2FBE1A2E90BC3881E6BDC3EA5EBE550D21BEB6B3023E1E97B0BD500C6DBD9577BB3D085E783C9044093D916844BE162F39BE955128BEB6F210BDA4C90BBDCD44643E511B3DBE8941593E30FEC73CBEC5D73D8FF2A2BDCB4F4B3E4F6DC2BB2249463E35888ABC80299C3D70BAFEBD40F617BEA941FFBDF13E123E4C305C3E5C838CBDF44A6EBD2A505EBEF0BFA83A8BC9E03DD0DA303D64522C3E58A96FBD7211593EC3BEA6BC14EF8ABDFE5949BE43880FBE46CE16BD097B633D7775193EB7E90F3ED2D3953DC271633D6BF8053DA6DFECBD5728953DB72E1DBD2B345DBD7053E83CE305B1BDF2F3013E0DE876BD8F1098BD7232E23D4CDE0C3DAF43A93D1A34DE3DFB1247BD569B3F3E09619BBD54502A3E6BA636BC17C3273EBE032E3E34F4173C54B10A3E9452EB3D82FDD33D005014BE578D51BDF3E116BE4EDE0F3E47BF58BEC35E793D7195303ED3A822BEA331963A9CF9403E1FD32DBD4EFCBB3DB47251BDD3A570BD3C32473E614DAD3A6978643EC49A543E02AA16BEDFDE243D7ADBE9BDE1224FBE6C6826BE5EC4083E7B00783D9E975C3EEF3A1FBDC7AD1DBDCB200EBDE55E993D734D97BC4C6E543E762EE83DBF6DFBBD5F161E3CB941F3BDF40BB2BD4C2E6ABD3B33963D7B3507BE3344553ED50454BD689A233E263939BE1D60D1BB4E3959BE0BAF203C719EC8BD48F3273EFB4B103D7943B0BDA66038BEC91500BE50263FBE52262FBE207BA93D661E823DEC4B56BE58CF3A3E913828BED66E42BE57F2B2BDFFB9503EE2FD4A3E2F23A43C542B493EE2CA34BE2966513EBB4B1FBD391127BEC942A7BD196DD9BD8AC012BD6514C1BC1F9E173EA59780BD511514BD9AB8F4BD339719BEB72639BE38C9D73CB5062F3E895743BD413055BE423D12BC402404BEB4A8463E4EA9F0BD57EA383C2432453C9611263ED58B283E6313C13DC2C95FBEA30B07BE15F2E4BD611C4BBE4FEF053CC73E013E9DF7B5BDED4160BE917CFDBD7FC9523E0A8B643EC300013E966686BDD0DC1ABE05B535BECD8A573E475E283DCECB5EBDFDE8B13D367E3EBEFCD7B43D3C7D24BECE26BD3D48BF20BEF7F0D7BB421F193E85BA77BDCAE272BAB0FD9CBD487E3FBEBBF9603D8B5E24BE27A9C33D95A2063E6DD0BC3DA359173EDA01A5BD505B97BDDEE6713D5283B43DE477DBBC674E2CBED5E3593DBCCB2FBED7D1303EA9753FBE017FC43DA94A9B3DE74A0E3E2FFC273EA596B2BDD8554A3C703E19BE04CF7BBD9155863D65FA1EBE144C573E709798BBF860CA3CC63F44BE"> - memref.global "private" constant @__constant_20x10xf32 : memref<20x10xf32> = 
dense<"0xD60ED23D6677593EEDD4253D579F523E952583BD454F62BE2C2AB53D827CF4BD7F4D41BD15896C3DDEFF3B3D5270AF3D461635BC07D5B8BDB82761BE33120CBE692C35BCDB70183EED6F4D3E271EAD3DFBC1653B77D0473E7920B43C2135C43C1780C2BBD145D43C15B1203DAAD3623C9A1808BECF144D3E7AFAC43D933CF8BDFE637BBD21BE53BEADCA233E250529BD4576FABD2B7B84BCCCB48FBC84B3D6BDAB54263EC31061BEF81913BE3E001C3E4D7A0E3EE4CE4EBE2119CFBDC6B6123EB3CA01BEB506BF3DDA0774BD37FD1E3D1EB44BBE38E409BE994A1FBE74FFDB3DBB05A93D29209A3D756B01BDEAA162BD602A03BE522F3FBE9C1DDC3D486F1EBEB99D21BEE05813BE074943BE4233E33C32BBFA3D81F31ABE747C53BEC1814A3EA77B363E551E113E65E5673D0EDAC4BDC9732E3DD45C3C3E1BCB113E9DD61DBDC754B93DC9CB88BCB5D84DBEF14E443ED54E1EBE659C343E349F62BEF5BBD73D1042FDBDBFDD4EBD306D813DE33C16BE39B00EBE946CD7BDB97CC9BD08A411BDC933103EDE725FBEF3C1553D830E3B3CBFD7253ECFBD0D3EDC6CD83DA6EA563D9238C53C1405B2BD4948D83C0A72B03DEC8AD93DD04F3A3E96216E3C0351123E034807BE0929E7BDD3E3453DCD388B3D68C034BCE7650E3E41C70D3E392C32BEEA885CBC19FFD9BD8B68F83C6FE3B5BCC302633E736E193ED8D1503EBEB64B3EE0263C3EAC4E223DB90511BE36A5443E86AA713C3698A3BCB129A93D517218BEA7BB51BECA19393E8948043E5DE08BBDA1F012BED3483E3EB3F664BEEB093DBEEF2FB7BC3243F6BC2EFA283EB41EF5BDF16A573D19E6E83DDCF4F23DA4D8C3BD4877DFBC1B13303B60330D3ECA5569BD97A090BD3A5E9C3D3565B63DE54F6BBDAC1B29BDDEB4CD3D6C5006BE61B08FBC5AD9DA3B80340EBE933AD83D6334E93CE5F18EBDCC3D73BD128D2ABD46CA34BDDDA5BABDE0C404BD533F223D3072B4BD7D7C413EB831DABD3359BBBB0A69993D6024AD3D0D0EE9BDBD5A80BC7FA6E83D400BFBBD616438BE34CBC73D4769D73D4E295EBEECBE843C838E9CBDA171FA3DDCED3EBE9FEDA6BDAD78A8BDD19702BD18B3043E39CF873C46E602BCD0CBC33D"> memref.global "private" constant @__constant_10xf32 : memref<10xf32> = dense<[0.0670170113, 0.0825609341, -0.125343189, -0.0073415176, -0.100303039, -0.214000896, 0.114002995, 0.21737574, 0.166609675, -0.119800359]> + memref.global "private" constant @__constant_10x20xf32 : memref<10x20xf32> = dense<"0xD60ED23DDEFF3B3DFBC1653B7AFAC43DAB54263EDA0774BD602A03BE747C53BEC754B93D306D813DBFD7253E96216E3CEA885CBCB90511BEA1F012BEDCF4F23DAC1B29BD128D2ABD6024AD3D838E9CBD6677593E5270AF3D77D0473E933CF8BDC31061BE37FD1E3D522F3FBEC1814A3EC9CB88BCE33C16BECFBD0D3E0351123E19FFD9BD36A5443ED3483E3EA4D8C3BDDEB4CD3D46CA34BD0D0EE9BDA171FA3DEDD4253D461635BC7920B43CFE637BBDF81913BE1EB44BBE9C1DDC3DA77B363EB5D84DBE39B00EBEDC6CD83D034807BE8B68F83C86AA713CB3F664BE4877DFBC6C5006BEDDA5BABDBD5A80BCDCED3EBE579F523E07D5B8BD2135C43C21BE53BE3E001C3E38E409BE486F1EBE551E113EF14E443E946CD7BDA6EA563D0929E7BD6FE3B5BC3698A3BCEB093DBE1B13303B61B08FBCE0C404BD7FA6E83D9FEDA6BD952583BDB82761BE1780C2BBADCA233E4D7A0E3E994A1FBEB99D21BE65E5673DD54E1EBEB97CC9BD9238C53CD3E3453DC302633EB129A93DEF2FB7BC60330D3E5AD9DA3B533F223D400BFBBDAD78A8BD454F62BE33120CBED145D43C250529BDE4CE4EBE74FFDB3DE05813BE0EDAC4BD659C343E08A411BD1405B2BDCD388B3D736E193E517218BE3243F6BCCA5569BD80340EBE3072B4BD616438BED19702BD2C2AB53D692C35BC15B1203D4576FABD2119CFBDBB05A93D074943BEC9732E3D349F62BEC933103E4948D83C68C034BCD8D1503EA7BB51BE2EFA283E97A090BD933AD83D7D7C413E34CBC73D18B3043E827CF4BDDB70183EAAD3623C2B7B84BCC6B6123E29209A3D4233E33CD45C3C3EF5BBD73DDE725FBE0A72B03DE7650E3EBEB64B3ECA19393EB41EF5BD3A5E9C3D6334E93CB831DABD4769D73D39CF873C7F4D41BDED6F4D3E9A1808BECCB48FBCB3CA01BE756B01BD32BBFA3D1BCB113E1042FDBDF3C1553DEC8AD93D41C70D3EE0263C3E8948043EF16A573D3565B63DE5F18EBD3359BBBB4E295EBE46E602BC15896C3D271EAD3DCF144D3E84B3D6BDB506BF3DEAA162BD81F31ABE9DD61DBDBFDD4EBD830E3B3CD04F3A3E392C32BEAC4E223D5DE08BBD19E6E83DE54F6BBDCC3D73BD0A69993DECBE843CD0CBC33D"> memref.global "private" constant 
@__constant_20xf32_0 : memref<20xf32> = dense<[0.124238588, -0.0375917405, -0.178324029, 2.1018261E-4, -0.0708629936, 0.179958493, 0.201986402, -0.0302014686, -0.0842267424, 0.0796111747, 0.0201944318, -0.183529228, -0.133614406, -0.0192934573, 0.193412527, 0.219010666, -0.0464102961, 0.00334274326, -0.0029087835, 0.0903228372]> + memref.global "private" constant @__constant_20x20xf32 : memref<20x20xf32> = dense<"0x32A148BE6D99D1BD3C565A3E614360BC1E9BDF3B265A9D3DBD5F18BE8FE25B3EC201E4BC085E783CA941FFBDD2D3953D6BA636BCD3A570BDBF6DFBBD50263FBE6514C1BCC2C95FBEFCD7B43DE477DBBC93869BBDBEED34BD62E5843DC40D4F3EFF38F73C45B35D3EDE0A0EBECC6652BE35B79BBD9044093DF13E123EC271633D17C3273E3C32473E5F161E3C52262FBE1F9E173EA30B07BE3C7D24BE674E2CBEDF0E313EA2CFA0BCA45CAF3C5A313E3EC33C35BD112654BBEDE673BD20FF4CBE09665BBE916844BE4C305C3E6BF8053DBE032E3E614DAD3AB941F3BD207BA93DA59780BD15F2E4BDCE26BD3DD5E3593DADAE1EBDBFCE46BE41EC6DBB34D6FDBD40D157BE8BEEB6BD23F447BED78823BD2F2FC8BD162F39BE5C838CBDA6DFECBD34F4173C6978643EF40BB2BD661E823D511514BD611C4BBE48BF20BEBCCB2FBE5C52443E85B19BBCE2019E3D9C3519BEDD478A3CBD70993DB37F3BBE32E5003EBEFA293E955128BEF44A6EBD5728953D54B10A3EC49A543E4C2E6ABDEC4B56BE9AB8F4BD4FEF053CF7F0D7BBD7D1303E3506E7BB4D7659BE1D3A013E72500C3DD1E6373E7DDE62BE052B503E392A323D668CEDBDB6F210BD2A505EBEB72E1DBD9452EB3D02AA16BE3B33963D58CF3A3E339719BEC73E013E421F193EA9753FBEF2CFB3BD398863BE42C74CBE1B39AABD436BEEBDDB8017BE5683343EE1BAA13D0F397ABDA4C90BBDF0BFA83A2B345DBD82FDD33DDFDE243D7B3507BE913828BEB72639BE9DF7B5BD85BA77BD017FC43D4CF5A13DC7A3DD3D4E3CE63D9B2983BD04BE2E3EE2AC4FBE13D6283EFB3A9BBDC1675D3ECD44643E8BC9E03D7053E83C005014BE7ADBE9BD3344553ED66E42BE38C9D73CED4160BECAE272BAA94A9B3DBC209B3D590C1ABE734E60BE110010BDBD5F2B3D9F8E213EC693A03D9FF356BDC0F1E03D511B3DBED0DA303DE305B1BD578D51BDE1224FBED50454BD57F2B2BDB5062F3E917CFDBDB0FD9CBDE74A0E3E697E213D4A942DBE530B01BEAFAD27BC1D7813BE592C1B3E751BB5BD3C4FE13D164A5FBD8941593E64522C3EF2F3013EF3E116BE6C6826BE689A233EFFB9503E895743BD7FC9523E487E3FBE2FFC273E4A14543D9EB929BECD1844BEEAFFB5BD0EF101BE809B3DBEF94F243EFDFA51BDAA43353C30FEC73C58A96FBD0DE876BD4EDE0F3E5EC4083E263939BEE2FD4A3E413055BE0A8B643EBBF9603DA596B2BDAB303B3E6A0D20BEEC38BE3C4A82103D0369A1BD7DF30B3E5014C23CB60E2C3E19AC2FBEBEC5D73D7211593E8F1098BD47BF58BE7B00783D1D60D1BB2F23A43C423D12BCC300013E8B5E24BED8554A3CB6C1FA3B941C90BD19E862BE8E7AD13C8FF8D93DD463133D003410BEBEE61D3E1A2E90BC8FF2A2BDC3BEA6BC7232E23DC35E793D9E975C3E4E3959BE542B493E402404BE966686BD27A9C33D703E19BE3524DBBC097A76BCFC6F0DBED2693FBE829D3A3E672DEB3DEBBD613EC5EFDA3D3881E6BDCB4F4B3E14EF8ABD4CDE0C3D7195303EEF3A1FBD0BAF203CE2CA34BEB4A8463ED0DC1ABE95A2063E04CF7BBDE1C81ABE0640443EE58D4F3D3E6125BE85CB2EBCEB5A44BECADD55BEE721A0BDC3EA5EBE4F6DC2BBFE5949BEAF43A93DD3A822BEC7AD1DBD719EC8BD2966513E4EA9F0BD05B535BE6DD0BC3D9155863D9DDF51BEAE5B30BE03A7AA3D03FD743D6A2B023EFFA2B03C56C2C3BD13E8F3BC550D21BE2249463E43880FBE1A34DE3DA331963ACB200EBD48F3273EBB4B1FBD57EA383CCD8A573EA359173E65FA1EBEBF11173E723C573E1DCF543C3B3825BEEF8C273E894A33BE8922313D0C34F5BDB6B3023E35888ABC46CE16BDFB1247BD9CF9403EE55E993DFB4B103D391127BE2432453C475E283DDA01A5BD144C573EFFDA4C3EC0EE403E756D503EB7FCD53DB6F3C2BD8A2935BEA41CF7BD3E911D3E1E97B0BD80299C3D097B633D569B3F3E1FD32DBD734D97BC7943B0BDC942A7BD9611263ECECB5EBD505B97BD709798BB6E8A2DBE0F2024BE81D23BBECCDA58BE27F955BD93A6223E0CF8D5BDFD7F38BD500C6DBD70BAFEBD7775193E09619BBD4EFCBB3D4C6E543EA66038BE196DD9BDD58B283EFDE8B13DDEE6713DF860CA3CF9F5013E3D24D53DE0D914BE2B3A4CBEAE3B4ABCE10741BE13AA29BE3C54933D9577BB3D40F617BEB7E90F3E54502A3E
B47251BD762EE83DC91500BE8AC012BD6313C13D367E3EBE5283B43DC63F44BE"> memref.global "private" constant @__constant_20xf32 : memref<20xf32> = dense<[0.101879634, -0.178835288, -0.0953023583, -0.0698504745, -0.19658649, -0.297641844, -0.223349303, 0.168986112, 9.007710e-02, 0.101534814, -0.0601868108, -0.0958566219, -0.243612081, 0.198881537, -0.293788224, -0.240900397, 0.184188008, 0.210917979, 0.121171109, -0.155078679]> + memref.global "private" constant @__constant_20x10xf32 : memref<20x10xf32> = dense<"0xEDAEFD3D2456463E409C203E725C76BE7D51683ED8D783BEA67B8F3E85C231BE4225383D620472BE6B88963EA8E7983EA0BE27BCCA748B3E7ADA4B3E562D9ABEB62B78BEEB7CF33B967059BD67FB97BE1051E33DCE3D9EBCA3380BBED59EA23DDA6EDD3D2B60943E0D424E3DF5FF363DC197B8BDFBAE8CBDFC7732BE690042BEC03C5A3E63F6103E2DFDF1BD88C91ABE5EBB7F3D32DCDBBD4FA407BEF08F553E055CEE3CF0D34CBD2775633D2F3527BEC9A5D5BDB2DC513E056098BEFBFED83D5C0C17BEFAB77D3E07F9413E5AFCAB3D62069C3E354722BE948E7E3E136825BEC5F011BDB640623D0F16953EFCC7D3BC080B9CBECC1AF13D0DF3963DCE329DBECFCE7FBEBB0DBD3DCB26033E66E5EA3D9884E7BC76249D3E9B2F4ABE8E3CF43B259F883E39E496BE6E9E813D8E26E7BD1656DD3C785D2B3E2E6CF6BDA2C404BE5608BD3B0BF2583E7AAD743E46C71CBD85DBE43BF4464EBD60B90DBE4E7470BEAF01453E937B9B3EF8E6DFBDC82B583DEB5AC8BDC849E83D6543143DD9E68E3E7EFF583D651C673EB6F466BE14197E3D507F46BE658D653C4B210F3E8BD2ADBD379970BE5B5115BEE9C118BDDC57473E37968ABE30212D3EC61183BE79C131BECE1F303E2596A73DA80F16BE3FCF123EDDF5673E601A24BE5BC056BE262F63BDACE23ABE9AEF24BD3D4983BDF2DA0ABED17C18BD1E194FBE451A58BDCF5FFABDCBD28E3E7899D1BDF010903E85B4B03DA3A3F63DD46BA03D05E9193E2CACBF3D8E0C32BE38AA14BED9AC493DF72C993E824129BE46DBAF3D3D993FBE752989BD0F403B3DC71E153D121C05BC9EFC60BCACC7103D7537DE3DAFB6D83CEB4013BD868FF1BCB8624CBE26EDA43B96F24B3E8D148E3EE3E5983E46EB73BE6FD39D3E779721BE26A9693E89B98ABE971808BE808C9ABE53E9E9BDD80AEA3D10EE7D3EBB39213D988D4B3D953B2B3EB17CC43C13D72E3DB1C184BDA6A2823D407141BEBE27AA3A2BE0453EFD30F0BDDB132B3E8B44CB3BEDAF77BE5012703ECC23743E08D786BE714361BE1F534B3ECE3C41BE835CD4BD097461BDB09383BEA24E5BBE826D35BE36B4AB3C2314AABC4D2E963D97023EBD0A3B753E426F4FBD8461313C"> func.func private @Unknown0(%arg0: memref<20xf32>, %arg1: memref<2x20xf32>) -> memref<2x20xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f32 - %c0 = arith.constant 0 : index - %c40 = arith.constant 40 : index - %c1 = arith.constant 1 : index %c20 = arith.constant 20 : index - %c-1 = arith.constant -1 : index - %alloc = memref.alloc() : memref<2x20xf32> - scf.for %arg2 = %c0 to %c40 step %c1 { - %0 = arith.remsi %arg2, %c20 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c20 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c20 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = memref.load %arg1[%9, %3] : memref<2x20xf32> - %11 = memref.load %arg0[%3] : memref<20xf32> - %12 = arith.addf %10, %11 : f32 - %13 = arith.maxnumf %12, %cst : f32 - memref.store %13, %alloc[%9, %3] : memref<2x20xf32> - } - return %alloc : memref<2x20xf32> - } - func.func private @Unknown1(%arg0: memref<20xf32>, %arg1: memref<2x20xf32>) -> memref<2x20xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f32 + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f32 %c40 = arith.constant 40 : index - %c1 = arith.constant 
1 : index - %c20 = arith.constant 20 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<2x20xf32> scf.for %arg2 = %c0 to %c40 step %c1 { %0 = arith.remsi %arg2, %c20 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c20 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c20 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = memref.load %arg1[%9, %3] : memref<2x20xf32> - %11 = memref.load %arg0[%3] : memref<20xf32> - %12 = arith.addf %10, %11 : f32 - %13 = arith.maxnumf %12, %cst : f32 - memref.store %13, %alloc[%9, %3] : memref<2x20xf32> + %1 = arith.divsi %arg2, %c20 : index + %2 = memref.load %arg0[%0] : memref<20xf32> + %3 = memref.load %arg1[%1, %0] : memref<2x20xf32> + %4 = arith.addf %3, %2 : f32 + %5 = arith.maximumf %4, %cst : f32 + memref.store %5, %alloc[%1, %0] : memref<2x20xf32> } return %alloc : memref<2x20xf32> } func.func private @Unknown2(%arg0: memref<10xf32>, %arg1: memref<2x10xf32>) -> memref<2x10xf32> attributes {__byteir_elementwise_fusion__} { + %c10 = arith.constant 10 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %c20 = arith.constant 20 : index - %c1 = arith.constant 1 : index - %c10 = arith.constant 10 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<2x10xf32> scf.for %arg2 = %c0 to %c20 step %c1 { %0 = arith.remsi %arg2, %c10 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c10 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c10 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = memref.load %arg1[%9, %3] : memref<2x10xf32> - %11 = memref.load %arg0[%3] : memref<10xf32> - %12 = arith.addf %10, %11 : f32 - memref.store %12, %alloc[%9, %3] : memref<2x10xf32> + %1 = arith.divsi %arg2, %c10 : index + %2 = memref.load %arg0[%0] : memref<10xf32> + %3 = memref.load %arg1[%1, %0] : memref<2x10xf32> + %4 = arith.addf %3, %2 : f32 + memref.store %4, %alloc[%1, %0] : memref<2x10xf32> } return %alloc : memref<2x10xf32> } func.func @forward(%arg0: memref<2x10xf32>) -> memref<2x10xf32> attributes {__placeholder__byre.entry_point} { - %0 = memref.get_global @__constant_20xf32 : memref<20xf32> - %1 = memref.get_global @__constant_20xf32_0 : memref<20xf32> - %2 = memref.get_global @__constant_10xf32 : memref<10xf32> - %3 = memref.get_global @__constant_20x10xf32 : memref<20x10xf32> - %4 = memref.get_global @__constant_20x20xf32 : memref<20x20xf32> - %5 = memref.get_global @__constant_10x20xf32 : memref<10x20xf32> + %0 = memref.get_global @__constant_20x10xf32 : memref<20x10xf32> + %1 = memref.get_global @__constant_20xf32 : memref<20xf32> + %2 = memref.get_global @__constant_20x20xf32 : memref<20x20xf32> + %3 = memref.get_global @__constant_20xf32_0 : memref<20xf32> + %4 = memref.get_global @__constant_10x20xf32 : memref<10x20xf32> + %5 = memref.get_global @__constant_10xf32 : memref<10xf32> %alloc = memref.alloc() : memref<2x20xf32> - byre.compute @MatmulOp_f32f32_f32(%arg0, %5, %alloc) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<2x10xf32>, memref<10x20xf32>, memref<2x20xf32> - %6 = call @Unknown0(%0, %alloc) : 
(memref<20xf32>, memref<2x20xf32>) -> memref<2x20xf32> + byre.compute @MatmulOp_f32f32_f32(%arg0, %0, %alloc) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<2x10xf32>, memref<20x10xf32>, memref<2x20xf32> + %6 = call @Unknown0(%1, %alloc) : (memref<20xf32>, memref<2x20xf32>) -> memref<2x20xf32> %alloc_0 = memref.alloc() : memref<2x20xf32> - byre.compute @MatmulOp_f32f32_f32(%6, %4, %alloc_0) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<2x20xf32>, memref<20x20xf32>, memref<2x20xf32> - %7 = call @Unknown1(%1, %alloc_0) : (memref<20xf32>, memref<2x20xf32>) -> memref<2x20xf32> + byre.compute @MatmulOp_f32f32_f32(%6, %2, %alloc_0) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<2x20xf32>, memref<20x20xf32>, memref<2x20xf32> + %7 = call @Unknown0(%3, %alloc_0) : (memref<20xf32>, memref<2x20xf32>) -> memref<2x20xf32> %alloc_1 = memref.alloc() : memref<2x10xf32> - byre.compute @MatmulOp_f32f32_f32(%7, %3, %alloc_1) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<2x20xf32>, memref<20x10xf32>, memref<2x10xf32> - %8 = call @Unknown2(%2, %alloc_1) : (memref<10xf32>, memref<2x10xf32>) -> memref<2x10xf32> + byre.compute @MatmulOp_f32f32_f32(%7, %4, %alloc_1) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<2x20xf32>, memref<10x20xf32>, memref<2x10xf32> + %8 = call @Unknown2(%5, %alloc_1) : (memref<10xf32>, memref<2x10xf32>) -> memref<2x10xf32> return %8 : memref<2x10xf32> } } \ No newline at end of file diff --git a/compiler/test/E2E/MLPInference/7_set_space_opt.mlir b/compiler/test/E2E/MLPInference/7_set_space_opt.mlir index 5e9cb9cbd..5f0671566 100644 --- a/compiler/test/E2E/MLPInference/7_set_space_opt.mlir +++ b/compiler/test/E2E/MLPInference/7_set_space_opt.mlir @@ -1,144 +1,88 @@ -// RUN: byteir-opt %s -remove-func-body="anchor-attr=__byteir_elementwise_fusion__" -set-op-space="entry-func=forward space=cuda" -set-arg-space="entry-func=forward all-space=cuda" | FileCheck %s +// RUN: byteir-opt %s -remove-func-body="anchor-attr=__byteir_elementwise_fusion__" -inline -gpu-launch-func-to-byre -set-op-space="entry-func=forward space=cuda" -set-arg-space="entry-func=forward all-space=cuda" | FileCheck %s // CHECK-LABEL: func.func @forward module attributes {gpu.container_module, torch.debug_module_name = "GraphModule"} { gpu.module @unified { gpu.func @Unknown2(%arg0: memref<10xf32>, %arg1: memref<2x10xf32>, %arg2: memref<2x10xf32>) kernel { - %c0 = arith.constant 0 : index %c20 = arith.constant 20 : index %c10 = arith.constant 10 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c20 : index - scf.if %5 { - %6 = arith.remsi %4, %c10 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c10 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c10 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg1[%15, %9] : memref<2x10xf32> - %17 = memref.load 
%arg0[%9] : memref<10xf32> - %18 = arith.addf %16, %17 : f32 - memref.store %18, %arg2[%15, %9] : memref<2x10xf32> - } - gpu.return - } - gpu.func @Unknown1(%arg0: memref<20xf32>, %arg1: memref<2x20xf32>, %arg2: memref<2x20xf32>) kernel { - %cst = arith.constant 0.000000e+00 : f32 - %c0 = arith.constant 0 : index - %c40 = arith.constant 40 : index - %c20 = arith.constant 20 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c40 : index - scf.if %5 { - %6 = arith.remsi %4, %c20 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c20 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c20 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg1[%15, %9] : memref<2x20xf32> - %17 = memref.load %arg0[%9] : memref<20xf32> - %18 = arith.addf %16, %17 : f32 - %19 = arith.maxnumf %18, %cst : f32 - memref.store %19, %arg2[%15, %9] : memref<2x20xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c20 step %6 { + %7 = arith.remsi %arg3, %c10 : index + %8 = arith.divsi %arg3, %c10 : index + %9 = memref.load %arg0[%7] : memref<10xf32> + %10 = memref.load %arg1[%8, %7] : memref<2x10xf32> + %11 = arith.addf %10, %9 : f32 + memref.store %11, %arg2[%8, %7] : memref<2x10xf32> } gpu.return } gpu.func @Unknown0(%arg0: memref<20xf32>, %arg1: memref<2x20xf32>, %arg2: memref<2x20xf32>) kernel { - %cst = arith.constant 0.000000e+00 : f32 - %c0 = arith.constant 0 : index %c40 = arith.constant 40 : index + %cst = arith.constant 0.000000e+00 : f32 %c20 = arith.constant 20 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c40 : index - scf.if %5 { - %6 = arith.remsi %4, %c20 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c20 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c20 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg1[%15, %9] : memref<2x20xf32> - %17 = memref.load %arg0[%9] : memref<20xf32> - %18 = arith.addf %16, %17 : f32 - %19 = arith.maxnumf %18, %cst : f32 - memref.store %19, %arg2[%15, %9] : memref<2x20xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c40 step %6 { + %7 = arith.remsi %arg3, %c20 : index + %8 = arith.divsi %arg3, %c20 : index + %9 = memref.load %arg0[%7] : memref<20xf32> + %10 = memref.load %arg1[%8, %7] : memref<2x20xf32> + %11 = arith.addf %10, %9 : f32 + %12 = arith.maximumf %11, %cst : f32 + memref.store %12, %arg2[%8, %7] : memref<2x20xf32> } gpu.return } } - memref.global "private" constant @__constant_10x20xf32 : memref<10x20xf32> = 
dense<"0xEDAEFD3D6B88963E1051E33DFC7732BE055CEE3C07F9413E080B9CBE9B2F4ABE5608BD3BF8E6DFBD507F46BEC61183BEACE23ABEF010903E824129BEAFB6D83C779721BE953B2B3E8B44CB3BB09383BE2456463EA8E7983ECE3D9EBC690042BEF0D34CBD5AFCAB3DCC1AF13D8E3CF43B0BF2583EC82B583D658D653C79C131BE9AEF24BD85B4B03D46DBAF3DEB4013BD26A9693EB17CC43CEDAF77BEA24E5BBE409C203EA0BE27BCA3380BBEC03C5A3E2775633D62069C3E0DF3963D259F883E7AAD743EEB5AC8BD4B210F3ECE1F303E3D4983BDA3A3F63D3D993FBE868FF1BC89B98ABE13D72E3D5012703E826D35BE725C76BECA748B3ED59EA23D63F6103E2F3527BE354722BECE329DBE39E496BE46C71CBDC849E83D8BD2ADBD2596A73DF2DA0ABED46BA03D752989BDB8624CBE971808BEB1C184BDCC23743E36B4AB3C7D51683E7ADA4B3EDA6EDD3D2DFDF1BDC9A5D5BD948E7E3ECFCE7FBE6E9E813D85DBE43B6543143D379970BEA80F16BED17C18BD05E9193E0F403B3D26EDA43B808C9ABEA6A2823D08D786BE2314AABCD8D783BE562D9ABE2B60943E88C91ABEB2DC513E136825BEBB0DBD3D8E26E7BDF4464EBDD9E68E3E5B5115BE3FCF123E1E194FBE2CACBF3DC71E153D96F24B3E53E9E9BD407141BE714361BE4D2E963DA67B8F3EB62B78BE0D424E3D5EBB7F3D056098BEC5F011BDCB26033E1656DD3C60B90DBE7EFF583DE9C118BDDDF5673E451A58BD8E0C32BE121C05BC8D148E3ED80AEA3DBE27AA3A1F534B3E97023EBD85C231BEEB7CF33BF5FF363D32DCDBBDFBFED83DB640623D66E5EA3D785D2B3E4E7470BE651C673EDC57473E601A24BECF5FFABD38AA14BE9EFC60BCE3E5983E10EE7D3E2BE0453ECE3C41BE0A3B753E4225383D967059BDC197B8BD4FA407BE5C0C17BE0F16953E9884E7BC2E6CF6BDAF01453EB6F466BE37968ABE5BC056BECBD28E3ED9AC493DACC7103D46EB73BEBB39213DFD30F0BD835CD4BD426F4FBD620472BE67FB97BEFBAE8CBDF08F553EFAB77D3EFCC7D3BC76249D3EA2C404BE937B9B3E14197E3D30212D3E262F63BD7899D1BDF72C993E7537DE3D6FD39D3E988D4B3DDB132B3E097461BD8461313C"> - memref.global "private" constant @__constant_20x20xf32 : memref<20x20xf32> = dense<"0x32A148BE93869BBDDF0E313EADAE1EBD5C52443E3506E7BBF2CFB3BD4CF5A13DBC209B3D697E213D4A14543DAB303B3EB6C1FA3B3524DBBCE1C81ABE9DDF51BEBF11173EFFDA4C3E6E8A2DBEF9F5013E6D99D1BDBEED34BDA2CFA0BCBFCE46BE85B19BBC4D7659BE398863BEC7A3DD3D590C1ABE4A942DBE9EB929BE6A0D20BE941C90BD097A76BC0640443EAE5B30BE723C573EC0EE403E0F2024BE3D24D53D3C565A3E62E5843DA45CAF3C41EC6DBBE2019E3D1D3A013E42C74CBE4E3CE63D734E60BE530B01BECD1844BEEC38BE3C19E862BEFC6F0DBEE58D4F3D03A7AA3D1DCF543C756D503E81D23BBEE0D914BE614360BCC40D4F3E5A313E3E34D6FDBD9C3519BE72500C3D1B39AABD9B2983BD110010BDAFAD27BCEAFFB5BD4A82103D8E7AD13CD2693FBE3E6125BE03FD743D3B3825BEB7FCD53DCCDA58BE2B3A4CBE1E9BDF3BFF38F73CC33C35BD40D157BEDD478A3CD1E6373E436BEEBD04BE2E3EBD5F2B3D1D7813BE0EF101BE0369A1BD8FF8D93D829D3A3E85CB2EBC6A2B023EEF8C273EB6F3C2BD27F955BDAE3B4ABC265A9D3D45B35D3E112654BB8BEEB6BDBD70993D7DDE62BEDB8017BEE2AC4FBE9F8E213E592C1B3E809B3DBE7DF30B3ED463133D672DEB3DEB5A44BEFFA2B03C894A33BE8A2935BE93A6223EE10741BEBD5F18BEDE0A0EBEEDE673BD23F447BEB37F3BBE052B503E5683343E13D6283EC693A03D751BB5BDF94F243E5014C23C003410BEEBBD613ECADD55BE56C2C3BD8922313DA41CF7BD0CF8D5BD13AA29BE8FE25B3ECC6652BE20FF4CBED78823BD32E5003E392A323DE1BAA13DFB3A9BBD9FF356BD3C4FE13DFDFA51BDB60E2C3EBEE61D3EC5EFDA3DE721A0BD13E8F3BC0C34F5BD3E911D3EFD7F38BD3C54933DC201E4BC35B79BBD09665BBE2F2FC8BDBEFA293E668CEDBD0F397ABDC1675D3EC0F1E03D164A5FBDAA43353C19AC2FBE1A2E90BC3881E6BDC3EA5EBE550D21BEB6B3023E1E97B0BD500C6DBD9577BB3D085E783C9044093D916844BE162F39BE955128BEB6F210BDA4C90BBDCD44643E511B3DBE8941593E30FEC73CBEC5D73D8FF2A2BDCB4F4B3E4F6DC2BB2249463E35888ABC80299C3D70BAFEBD40F617BEA941FFBDF13E123E4C305C3E5C838CBDF44A6EBD2A505EBEF0BFA83A8BC9E03DD0DA303D64522C3E58A96FBD7211593EC3BEA6BC14EF8ABDFE5949BE43880FBE46CE16BD097B633D7775193EB7E90F3ED2D3953DC271633D6BF8053DA6DFECBD5728953DB72E1DBD2B345DBD7053E83CE305B1BDF2F3013E0DE876BD8F1098B
D7232E23D4CDE0C3DAF43A93D1A34DE3DFB1247BD569B3F3E09619BBD54502A3E6BA636BC17C3273EBE032E3E34F4173C54B10A3E9452EB3D82FDD33D005014BE578D51BDF3E116BE4EDE0F3E47BF58BEC35E793D7195303ED3A822BEA331963A9CF9403E1FD32DBD4EFCBB3DB47251BDD3A570BD3C32473E614DAD3A6978643EC49A543E02AA16BEDFDE243D7ADBE9BDE1224FBE6C6826BE5EC4083E7B00783D9E975C3EEF3A1FBDC7AD1DBDCB200EBDE55E993D734D97BC4C6E543E762EE83DBF6DFBBD5F161E3CB941F3BDF40BB2BD4C2E6ABD3B33963D7B3507BE3344553ED50454BD689A233E263939BE1D60D1BB4E3959BE0BAF203C719EC8BD48F3273EFB4B103D7943B0BDA66038BEC91500BE50263FBE52262FBE207BA93D661E823DEC4B56BE58CF3A3E913828BED66E42BE57F2B2BDFFB9503EE2FD4A3E2F23A43C542B493EE2CA34BE2966513EBB4B1FBD391127BEC942A7BD196DD9BD8AC012BD6514C1BC1F9E173EA59780BD511514BD9AB8F4BD339719BEB72639BE38C9D73CB5062F3E895743BD413055BE423D12BC402404BEB4A8463E4EA9F0BD57EA383C2432453C9611263ED58B283E6313C13DC2C95FBEA30B07BE15F2E4BD611C4BBE4FEF053CC73E013E9DF7B5BDED4160BE917CFDBD7FC9523E0A8B643EC300013E966686BDD0DC1ABE05B535BECD8A573E475E283DCECB5EBDFDE8B13D367E3EBEFCD7B43D3C7D24BECE26BD3D48BF20BEF7F0D7BB421F193E85BA77BDCAE272BAB0FD9CBD487E3FBEBBF9603D8B5E24BE27A9C33D95A2063E6DD0BC3DA359173EDA01A5BD505B97BDDEE6713D5283B43DE477DBBC674E2CBED5E3593DBCCB2FBED7D1303EA9753FBE017FC43DA94A9B3DE74A0E3E2FFC273EA596B2BDD8554A3C703E19BE04CF7BBD9155863D65FA1EBE144C573E709798BBF860CA3CC63F44BE"> - memref.global "private" constant @__constant_20x10xf32 : memref<20x10xf32> = dense<"0xD60ED23D6677593EEDD4253D579F523E952583BD454F62BE2C2AB53D827CF4BD7F4D41BD15896C3DDEFF3B3D5270AF3D461635BC07D5B8BDB82761BE33120CBE692C35BCDB70183EED6F4D3E271EAD3DFBC1653B77D0473E7920B43C2135C43C1780C2BBD145D43C15B1203DAAD3623C9A1808BECF144D3E7AFAC43D933CF8BDFE637BBD21BE53BEADCA233E250529BD4576FABD2B7B84BCCCB48FBC84B3D6BDAB54263EC31061BEF81913BE3E001C3E4D7A0E3EE4CE4EBE2119CFBDC6B6123EB3CA01BEB506BF3DDA0774BD37FD1E3D1EB44BBE38E409BE994A1FBE74FFDB3DBB05A93D29209A3D756B01BDEAA162BD602A03BE522F3FBE9C1DDC3D486F1EBEB99D21BEE05813BE074943BE4233E33C32BBFA3D81F31ABE747C53BEC1814A3EA77B363E551E113E65E5673D0EDAC4BDC9732E3DD45C3C3E1BCB113E9DD61DBDC754B93DC9CB88BCB5D84DBEF14E443ED54E1EBE659C343E349F62BEF5BBD73D1042FDBDBFDD4EBD306D813DE33C16BE39B00EBE946CD7BDB97CC9BD08A411BDC933103EDE725FBEF3C1553D830E3B3CBFD7253ECFBD0D3EDC6CD83DA6EA563D9238C53C1405B2BD4948D83C0A72B03DEC8AD93DD04F3A3E96216E3C0351123E034807BE0929E7BDD3E3453DCD388B3D68C034BCE7650E3E41C70D3E392C32BEEA885CBC19FFD9BD8B68F83C6FE3B5BCC302633E736E193ED8D1503EBEB64B3EE0263C3EAC4E223DB90511BE36A5443E86AA713C3698A3BCB129A93D517218BEA7BB51BECA19393E8948043E5DE08BBDA1F012BED3483E3EB3F664BEEB093DBEEF2FB7BC3243F6BC2EFA283EB41EF5BDF16A573D19E6E83DDCF4F23DA4D8C3BD4877DFBC1B13303B60330D3ECA5569BD97A090BD3A5E9C3D3565B63DE54F6BBDAC1B29BDDEB4CD3D6C5006BE61B08FBC5AD9DA3B80340EBE933AD83D6334E93CE5F18EBDCC3D73BD128D2ABD46CA34BDDDA5BABDE0C404BD533F223D3072B4BD7D7C413EB831DABD3359BBBB0A69993D6024AD3D0D0EE9BDBD5A80BC7FA6E83D400BFBBD616438BE34CBC73D4769D73D4E295EBEECBE843C838E9CBDA171FA3DDCED3EBE9FEDA6BDAD78A8BDD19702BD18B3043E39CF873C46E602BCD0CBC33D"> memref.global "private" constant @__constant_10xf32 : memref<10xf32> = dense<[0.0670170113, 0.0825609341, -0.125343189, -0.0073415176, -0.100303039, -0.214000896, 0.114002995, 0.21737574, 0.166609675, -0.119800359]> + memref.global "private" constant @__constant_10x20xf32 : memref<10x20xf32> = 
dense<"0xD60ED23DDEFF3B3DFBC1653B7AFAC43DAB54263EDA0774BD602A03BE747C53BEC754B93D306D813DBFD7253E96216E3CEA885CBCB90511BEA1F012BEDCF4F23DAC1B29BD128D2ABD6024AD3D838E9CBD6677593E5270AF3D77D0473E933CF8BDC31061BE37FD1E3D522F3FBEC1814A3EC9CB88BCE33C16BECFBD0D3E0351123E19FFD9BD36A5443ED3483E3EA4D8C3BDDEB4CD3D46CA34BD0D0EE9BDA171FA3DEDD4253D461635BC7920B43CFE637BBDF81913BE1EB44BBE9C1DDC3DA77B363EB5D84DBE39B00EBEDC6CD83D034807BE8B68F83C86AA713CB3F664BE4877DFBC6C5006BEDDA5BABDBD5A80BCDCED3EBE579F523E07D5B8BD2135C43C21BE53BE3E001C3E38E409BE486F1EBE551E113EF14E443E946CD7BDA6EA563D0929E7BD6FE3B5BC3698A3BCEB093DBE1B13303B61B08FBCE0C404BD7FA6E83D9FEDA6BD952583BDB82761BE1780C2BBADCA233E4D7A0E3E994A1FBEB99D21BE65E5673DD54E1EBEB97CC9BD9238C53CD3E3453DC302633EB129A93DEF2FB7BC60330D3E5AD9DA3B533F223D400BFBBDAD78A8BD454F62BE33120CBED145D43C250529BDE4CE4EBE74FFDB3DE05813BE0EDAC4BD659C343E08A411BD1405B2BDCD388B3D736E193E517218BE3243F6BCCA5569BD80340EBE3072B4BD616438BED19702BD2C2AB53D692C35BC15B1203D4576FABD2119CFBDBB05A93D074943BEC9732E3D349F62BEC933103E4948D83C68C034BCD8D1503EA7BB51BE2EFA283E97A090BD933AD83D7D7C413E34CBC73D18B3043E827CF4BDDB70183EAAD3623C2B7B84BCC6B6123E29209A3D4233E33CD45C3C3EF5BBD73DDE725FBE0A72B03DE7650E3EBEB64B3ECA19393EB41EF5BD3A5E9C3D6334E93CB831DABD4769D73D39CF873C7F4D41BDED6F4D3E9A1808BECCB48FBCB3CA01BE756B01BD32BBFA3D1BCB113E1042FDBDF3C1553DEC8AD93D41C70D3EE0263C3E8948043EF16A573D3565B63DE5F18EBD3359BBBB4E295EBE46E602BC15896C3D271EAD3DCF144D3E84B3D6BDB506BF3DEAA162BD81F31ABE9DD61DBDBFDD4EBD830E3B3CD04F3A3E392C32BEAC4E223D5DE08BBD19E6E83DE54F6BBDCC3D73BD0A69993DECBE843CD0CBC33D"> memref.global "private" constant @__constant_20xf32_0 : memref<20xf32> = dense<[0.124238588, -0.0375917405, -0.178324029, 2.1018261E-4, -0.0708629936, 0.179958493, 0.201986402, -0.0302014686, -0.0842267424, 0.0796111747, 0.0201944318, -0.183529228, -0.133614406, -0.0192934573, 0.193412527, 0.219010666, -0.0464102961, 0.00334274326, -0.0029087835, 0.0903228372]> + memref.global "private" constant @__constant_20x20xf32 : memref<20x20xf32> = 
dense<"0x32A148BE6D99D1BD3C565A3E614360BC1E9BDF3B265A9D3DBD5F18BE8FE25B3EC201E4BC085E783CA941FFBDD2D3953D6BA636BCD3A570BDBF6DFBBD50263FBE6514C1BCC2C95FBEFCD7B43DE477DBBC93869BBDBEED34BD62E5843DC40D4F3EFF38F73C45B35D3EDE0A0EBECC6652BE35B79BBD9044093DF13E123EC271633D17C3273E3C32473E5F161E3C52262FBE1F9E173EA30B07BE3C7D24BE674E2CBEDF0E313EA2CFA0BCA45CAF3C5A313E3EC33C35BD112654BBEDE673BD20FF4CBE09665BBE916844BE4C305C3E6BF8053DBE032E3E614DAD3AB941F3BD207BA93DA59780BD15F2E4BDCE26BD3DD5E3593DADAE1EBDBFCE46BE41EC6DBB34D6FDBD40D157BE8BEEB6BD23F447BED78823BD2F2FC8BD162F39BE5C838CBDA6DFECBD34F4173C6978643EF40BB2BD661E823D511514BD611C4BBE48BF20BEBCCB2FBE5C52443E85B19BBCE2019E3D9C3519BEDD478A3CBD70993DB37F3BBE32E5003EBEFA293E955128BEF44A6EBD5728953D54B10A3EC49A543E4C2E6ABDEC4B56BE9AB8F4BD4FEF053CF7F0D7BBD7D1303E3506E7BB4D7659BE1D3A013E72500C3DD1E6373E7DDE62BE052B503E392A323D668CEDBDB6F210BD2A505EBEB72E1DBD9452EB3D02AA16BE3B33963D58CF3A3E339719BEC73E013E421F193EA9753FBEF2CFB3BD398863BE42C74CBE1B39AABD436BEEBDDB8017BE5683343EE1BAA13D0F397ABDA4C90BBDF0BFA83A2B345DBD82FDD33DDFDE243D7B3507BE913828BEB72639BE9DF7B5BD85BA77BD017FC43D4CF5A13DC7A3DD3D4E3CE63D9B2983BD04BE2E3EE2AC4FBE13D6283EFB3A9BBDC1675D3ECD44643E8BC9E03D7053E83C005014BE7ADBE9BD3344553ED66E42BE38C9D73CED4160BECAE272BAA94A9B3DBC209B3D590C1ABE734E60BE110010BDBD5F2B3D9F8E213EC693A03D9FF356BDC0F1E03D511B3DBED0DA303DE305B1BD578D51BDE1224FBED50454BD57F2B2BDB5062F3E917CFDBDB0FD9CBDE74A0E3E697E213D4A942DBE530B01BEAFAD27BC1D7813BE592C1B3E751BB5BD3C4FE13D164A5FBD8941593E64522C3EF2F3013EF3E116BE6C6826BE689A233EFFB9503E895743BD7FC9523E487E3FBE2FFC273E4A14543D9EB929BECD1844BEEAFFB5BD0EF101BE809B3DBEF94F243EFDFA51BDAA43353C30FEC73C58A96FBD0DE876BD4EDE0F3E5EC4083E263939BEE2FD4A3E413055BE0A8B643EBBF9603DA596B2BDAB303B3E6A0D20BEEC38BE3C4A82103D0369A1BD7DF30B3E5014C23CB60E2C3E19AC2FBEBEC5D73D7211593E8F1098BD47BF58BE7B00783D1D60D1BB2F23A43C423D12BCC300013E8B5E24BED8554A3CB6C1FA3B941C90BD19E862BE8E7AD13C8FF8D93DD463133D003410BEBEE61D3E1A2E90BC8FF2A2BDC3BEA6BC7232E23DC35E793D9E975C3E4E3959BE542B493E402404BE966686BD27A9C33D703E19BE3524DBBC097A76BCFC6F0DBED2693FBE829D3A3E672DEB3DEBBD613EC5EFDA3D3881E6BDCB4F4B3E14EF8ABD4CDE0C3D7195303EEF3A1FBD0BAF203CE2CA34BEB4A8463ED0DC1ABE95A2063E04CF7BBDE1C81ABE0640443EE58D4F3D3E6125BE85CB2EBCEB5A44BECADD55BEE721A0BDC3EA5EBE4F6DC2BBFE5949BEAF43A93DD3A822BEC7AD1DBD719EC8BD2966513E4EA9F0BD05B535BE6DD0BC3D9155863D9DDF51BEAE5B30BE03A7AA3D03FD743D6A2B023EFFA2B03C56C2C3BD13E8F3BC550D21BE2249463E43880FBE1A34DE3DA331963ACB200EBD48F3273EBB4B1FBD57EA383CCD8A573EA359173E65FA1EBEBF11173E723C573E1DCF543C3B3825BEEF8C273E894A33BE8922313D0C34F5BDB6B3023E35888ABC46CE16BDFB1247BD9CF9403EE55E993DFB4B103D391127BE2432453C475E283DDA01A5BD144C573EFFDA4C3EC0EE403E756D503EB7FCD53DB6F3C2BD8A2935BEA41CF7BD3E911D3E1E97B0BD80299C3D097B633D569B3F3E1FD32DBD734D97BC7943B0BDC942A7BD9611263ECECB5EBD505B97BD709798BB6E8A2DBE0F2024BE81D23BBECCDA58BE27F955BD93A6223E0CF8D5BDFD7F38BD500C6DBD70BAFEBD7775193E09619BBD4EFCBB3D4C6E543EA66038BE196DD9BDD58B283EFDE8B13DDEE6713DF860CA3CF9F5013E3D24D53DE0D914BE2B3A4CBEAE3B4ABCE10741BE13AA29BE3C54933D9577BB3D40F617BEB7E90F3E54502A3EB47251BD762EE83DC91500BE8AC012BD6313C13D367E3EBE5283B43DC63F44BE"> memref.global "private" constant @__constant_20xf32 : memref<20xf32> = dense<[0.101879634, -0.178835288, -0.0953023583, -0.0698504745, -0.19658649, -0.297641844, -0.223349303, 0.168986112, 9.007710e-02, 0.101534814, -0.0601868108, -0.0958566219, -0.243612081, 0.198881537, -0.293788224, -0.240900397, 0.184188008, 0.210917979, 0.121171109, 
-0.155078679]> - func.func private @Unknown0(%arg0: memref<20xf32>, %arg1: memref<2x20xf32>) -> memref<2x20xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown0", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<2x20xf32> - gpu.launch_func @unified::@Unknown0 blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<20xf32>, %arg1 : memref<2x20xf32>, %alloc : memref<2x20xf32>) - return %alloc : memref<2x20xf32> - } - func.func private @Unknown1(%arg0: memref<20xf32>, %arg1: memref<2x20xf32>) -> memref<2x20xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown1", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + memref.global "private" constant @__constant_20x10xf32 : memref<20x10xf32> = dense<"0xEDAEFD3D2456463E409C203E725C76BE7D51683ED8D783BEA67B8F3E85C231BE4225383D620472BE6B88963EA8E7983EA0BE27BCCA748B3E7ADA4B3E562D9ABEB62B78BEEB7CF33B967059BD67FB97BE1051E33DCE3D9EBCA3380BBED59EA23DDA6EDD3D2B60943E0D424E3DF5FF363DC197B8BDFBAE8CBDFC7732BE690042BEC03C5A3E63F6103E2DFDF1BD88C91ABE5EBB7F3D32DCDBBD4FA407BEF08F553E055CEE3CF0D34CBD2775633D2F3527BEC9A5D5BDB2DC513E056098BEFBFED83D5C0C17BEFAB77D3E07F9413E5AFCAB3D62069C3E354722BE948E7E3E136825BEC5F011BDB640623D0F16953EFCC7D3BC080B9CBECC1AF13D0DF3963DCE329DBECFCE7FBEBB0DBD3DCB26033E66E5EA3D9884E7BC76249D3E9B2F4ABE8E3CF43B259F883E39E496BE6E9E813D8E26E7BD1656DD3C785D2B3E2E6CF6BDA2C404BE5608BD3B0BF2583E7AAD743E46C71CBD85DBE43BF4464EBD60B90DBE4E7470BEAF01453E937B9B3EF8E6DFBDC82B583DEB5AC8BDC849E83D6543143DD9E68E3E7EFF583D651C673EB6F466BE14197E3D507F46BE658D653C4B210F3E8BD2ADBD379970BE5B5115BEE9C118BDDC57473E37968ABE30212D3EC61183BE79C131BECE1F303E2596A73DA80F16BE3FCF123EDDF5673E601A24BE5BC056BE262F63BDACE23ABE9AEF24BD3D4983BDF2DA0ABED17C18BD1E194FBE451A58BDCF5FFABDCBD28E3E7899D1BDF010903E85B4B03DA3A3F63DD46BA03D05E9193E2CACBF3D8E0C32BE38AA14BED9AC493DF72C993E824129BE46DBAF3D3D993FBE752989BD0F403B3DC71E153D121C05BC9EFC60BCACC7103D7537DE3DAFB6D83CEB4013BD868FF1BCB8624CBE26EDA43B96F24B3E8D148E3EE3E5983E46EB73BE6FD39D3E779721BE26A9693E89B98ABE971808BE808C9ABE53E9E9BDD80AEA3D10EE7D3EBB39213D988D4B3D953B2B3EB17CC43C13D72E3DB1C184BDA6A2823D407141BEBE27AA3A2BE0453EFD30F0BDDB132B3E8B44CB3BEDAF77BE5012703ECC23743E08D786BE714361BE1F534B3ECE3C41BE835CD4BD097461BDB09383BEA24E5BBE826D35BE36B4AB3C2314AABC4D2E963D97023EBD0A3B753E426F4FBD8461313C"> + func.func private @Unknown0(%arg0: memref<20xf32>, %arg1: memref<2x20xf32>) -> memref<2x20xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown0", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<2x20xf32> - gpu.launch_func @unified::@Unknown1 blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<20xf32>, %arg1 : memref<2x20xf32>, %alloc : memref<2x20xf32>) + gpu.launch_func @unified::@Unknown0 
blocks in (%c1, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<20xf32>, %arg1 : memref<2x20xf32>, %alloc : memref<2x20xf32>) return %alloc : memref<2x20xf32> } - func.func private @Unknown2(%arg0: memref<10xf32>, %arg1: memref<2x10xf32>) -> memref<2x10xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown2", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown2(%arg0: memref<10xf32>, %arg1: memref<2x10xf32>) -> memref<2x10xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown2", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<2x10xf32> - gpu.launch_func @unified::@Unknown2 blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<10xf32>, %arg1 : memref<2x10xf32>, %alloc : memref<2x10xf32>) + gpu.launch_func @unified::@Unknown2 blocks in (%c1, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<10xf32>, %arg1 : memref<2x10xf32>, %alloc : memref<2x10xf32>) return %alloc : memref<2x10xf32> } func.func @forward(%arg0: memref<2x10xf32>) -> memref<2x10xf32> attributes {__placeholder__byre.entry_point} { - %0 = memref.get_global @__constant_20xf32 : memref<20xf32> - %1 = memref.get_global @__constant_20xf32_0 : memref<20xf32> - %2 = memref.get_global @__constant_10xf32 : memref<10xf32> - %3 = memref.get_global @__constant_20x10xf32 : memref<20x10xf32> - %4 = memref.get_global @__constant_20x20xf32 : memref<20x20xf32> - %5 = memref.get_global @__constant_10x20xf32 : memref<10x20xf32> + %0 = memref.get_global @__constant_20x10xf32 : memref<20x10xf32> + %1 = memref.get_global @__constant_20xf32 : memref<20xf32> + %2 = memref.get_global @__constant_20x20xf32 : memref<20x20xf32> + %3 = memref.get_global @__constant_20xf32_0 : memref<20xf32> + %4 = memref.get_global @__constant_10x20xf32 : memref<10x20xf32> + %5 = memref.get_global @__constant_10xf32 : memref<10xf32> %alloc = memref.alloc() : memref<2x20xf32> - byre.compute @MatmulOp_f32f32_f32(%arg0, %5, %alloc) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<2x10xf32>, memref<10x20xf32>, memref<2x20xf32> - %6 = call @Unknown0(%0, %alloc) : (memref<20xf32>, memref<2x20xf32>) -> memref<2x20xf32> + byre.compute @MatmulOp_f32f32_f32(%arg0, %0, %alloc) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<2x10xf32>, memref<20x10xf32>, memref<2x20xf32> + %6 = call @Unknown0(%1, %alloc) : (memref<20xf32>, memref<2x20xf32>) -> memref<2x20xf32> %alloc_0 = memref.alloc() : memref<2x20xf32> - byre.compute @MatmulOp_f32f32_f32(%6, %4, %alloc_0) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<2x20xf32>, memref<20x20xf32>, memref<2x20xf32> - %7 = call @Unknown1(%1, %alloc_0) : (memref<20xf32>, memref<2x20xf32>) -> memref<2x20xf32> + byre.compute @MatmulOp_f32f32_f32(%6, %2, %alloc_0) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : 
i32], rhs_contracting_dimension = 1 : i64} : memref<2x20xf32>, memref<20x20xf32>, memref<2x20xf32> + %7 = call @Unknown0(%3, %alloc_0) : (memref<20xf32>, memref<2x20xf32>) -> memref<2x20xf32> %alloc_1 = memref.alloc() : memref<2x10xf32> - byre.compute @MatmulOp_f32f32_f32(%7, %3, %alloc_1) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<2x20xf32>, memref<20x10xf32>, memref<2x10xf32> - %8 = call @Unknown2(%2, %alloc_1) : (memref<10xf32>, memref<2x10xf32>) -> memref<2x10xf32> + byre.compute @MatmulOp_f32f32_f32(%7, %4, %alloc_1) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<2x20xf32>, memref<10x20xf32>, memref<2x10xf32> + %8 = call @Unknown2(%5, %alloc_1) : (memref<10xf32>, memref<2x10xf32>) -> memref<2x10xf32> return %8 : memref<2x10xf32> } } \ No newline at end of file diff --git a/compiler/test/E2E/MLPInference/8_byre_opt.mlir b/compiler/test/E2E/MLPInference/8_byre_opt.mlir index a3557aabd..549239c4e 100644 --- a/compiler/test/E2E/MLPInference/8_byre_opt.mlir +++ b/compiler/test/E2E/MLPInference/8_byre_opt.mlir @@ -5,122 +5,72 @@ module attributes {gpu.container_module, torch.debug_module_name = "GraphModule"} { gpu.module @unified { gpu.func @Unknown2(%arg0: memref<10xf32>, %arg1: memref<2x10xf32>, %arg2: memref<2x10xf32>) kernel { - %c0 = arith.constant 0 : index %c20 = arith.constant 20 : index %c10 = arith.constant 10 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c20 : index - scf.if %5 { - %6 = arith.remsi %4, %c10 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c10 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c10 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg1[%15, %9] : memref<2x10xf32> - %17 = memref.load %arg0[%9] : memref<10xf32> - %18 = arith.addf %16, %17 : f32 - memref.store %18, %arg2[%15, %9] : memref<2x10xf32> - } - gpu.return - } - gpu.func @Unknown1(%arg0: memref<20xf32>, %arg1: memref<2x20xf32>, %arg2: memref<2x20xf32>) kernel { - %cst = arith.constant 0.000000e+00 : f32 - %c0 = arith.constant 0 : index - %c40 = arith.constant 40 : index - %c20 = arith.constant 20 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c40 : index - scf.if %5 { - %6 = arith.remsi %4, %c20 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c20 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c20 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg1[%15, %9] : memref<2x20xf32> - %17 = memref.load %arg0[%9] : memref<20xf32> - %18 = arith.addf %16, %17 : f32 - %19 = arith.maxnumf %18, %cst : f32 - memref.store %19, %arg2[%15, %9] : memref<2x20xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c20 step %6 { + %7 = arith.remsi %arg3, %c10 : index + %8 = 
arith.divsi %arg3, %c10 : index + %9 = memref.load %arg0[%7] : memref<10xf32> + %10 = memref.load %arg1[%8, %7] : memref<2x10xf32> + %11 = arith.addf %10, %9 : f32 + memref.store %11, %arg2[%8, %7] : memref<2x10xf32> } gpu.return } gpu.func @Unknown0(%arg0: memref<20xf32>, %arg1: memref<2x20xf32>, %arg2: memref<2x20xf32>) kernel { - %cst = arith.constant 0.000000e+00 : f32 - %c0 = arith.constant 0 : index %c40 = arith.constant 40 : index + %cst = arith.constant 0.000000e+00 : f32 %c20 = arith.constant 20 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c40 : index - scf.if %5 { - %6 = arith.remsi %4, %c20 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c20 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c20 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg1[%15, %9] : memref<2x20xf32> - %17 = memref.load %arg0[%9] : memref<20xf32> - %18 = arith.addf %16, %17 : f32 - %19 = arith.maxnumf %18, %cst : f32 - memref.store %19, %arg2[%15, %9] : memref<2x20xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c40 step %6 { + %7 = arith.remsi %arg3, %c20 : index + %8 = arith.divsi %arg3, %c20 : index + %9 = memref.load %arg0[%7] : memref<20xf32> + %10 = memref.load %arg1[%8, %7] : memref<2x20xf32> + %11 = arith.addf %10, %9 : f32 + %12 = arith.maximumf %11, %cst : f32 + memref.store %12, %arg2[%8, %7] : memref<2x20xf32> } gpu.return } } - func.func private @Unknown0(memref<20xf32, "cuda">, memref<2x20xf32, "cuda">) -> memref<2x20xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown0", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown1(memref<20xf32, "cuda">, memref<2x20xf32, "cuda">) -> memref<2x20xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown1", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown2(memref<10xf32, "cuda">, memref<2x10xf32, "cuda">) -> memref<2x10xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown2", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown0(memref<20xf32, "cuda">, memref<2x20xf32, "cuda">) -> memref<2x20xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown0", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown2(memref<10xf32, "cuda">, memref<2x10xf32, "cuda">) -> memref<2x10xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, 
__byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown2", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} func.func @forward(%arg0: memref<2x10xf32, "cuda">) -> memref<2x10xf32, "cuda"> attributes {__placeholder__byre.entry_point} { - %0 = memref.get_global @__constant_20xf32_cuda : memref<20xf32, "cuda"> {device = "cuda"} - %1 = memref.get_global @__constant_20xf32_0_cuda : memref<20xf32, "cuda"> {device = "cuda"} - %2 = memref.get_global @__constant_10xf32_cuda : memref<10xf32, "cuda"> {device = "cuda"} - %3 = memref.get_global @__constant_20x10xf32_cuda : memref<20x10xf32, "cuda"> {device = "cuda"} - %4 = memref.get_global @__constant_20x20xf32_cuda : memref<20x20xf32, "cuda"> {device = "cuda"} - %5 = memref.get_global @__constant_10x20xf32_cuda : memref<10x20xf32, "cuda"> {device = "cuda"} + %0 = memref.get_global @__constant_20x10xf32_cuda : memref<20x10xf32, "cuda"> {device = "cuda"} + %1 = memref.get_global @__constant_20xf32_cuda : memref<20xf32, "cuda"> {device = "cuda"} + %2 = memref.get_global @__constant_20x20xf32_cuda : memref<20x20xf32, "cuda"> {device = "cuda"} + %3 = memref.get_global @__constant_20xf32_0_cuda : memref<20xf32, "cuda"> {device = "cuda"} + %4 = memref.get_global @__constant_10x20xf32_cuda : memref<10x20xf32, "cuda"> {device = "cuda"} + %5 = memref.get_global @__constant_10xf32_cuda : memref<10xf32, "cuda"> {device = "cuda"} %alloc = memref.alloc() : memref<2x20xf32, "cuda"> - byre.compute @MatmulOp_f32f32_f32(%arg0, %5, %alloc) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<2x10xf32, "cuda">, memref<10x20xf32, "cuda">, memref<2x20xf32, "cuda"> - %6 = call @Unknown0(%0, %alloc) : (memref<20xf32, "cuda">, memref<2x20xf32, "cuda">) -> memref<2x20xf32, "cuda"> + byre.compute @MatmulOp_f32f32_f32(%arg0, %0, %alloc) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<2x10xf32, "cuda">, memref<20x10xf32, "cuda">, memref<2x20xf32, "cuda"> + %6 = call @Unknown0(%1, %alloc) : (memref<20xf32, "cuda">, memref<2x20xf32, "cuda">) -> memref<2x20xf32, "cuda"> %alloc_0 = memref.alloc() : memref<2x20xf32, "cuda"> - byre.compute @MatmulOp_f32f32_f32(%6, %4, %alloc_0) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<2x20xf32, "cuda">, memref<20x20xf32, "cuda">, memref<2x20xf32, "cuda"> - %7 = call @Unknown1(%1, %alloc_0) : (memref<20xf32, "cuda">, memref<2x20xf32, "cuda">) -> memref<2x20xf32, "cuda"> + byre.compute @MatmulOp_f32f32_f32(%6, %2, %alloc_0) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<2x20xf32, "cuda">, memref<20x20xf32, "cuda">, memref<2x20xf32, "cuda"> + %7 = call @Unknown0(%3, %alloc_0) : (memref<20xf32, "cuda">, memref<2x20xf32, "cuda">) -> memref<2x20xf32, "cuda"> %alloc_1 = memref.alloc() : memref<2x10xf32, "cuda"> - byre.compute @MatmulOp_f32f32_f32(%7, %3, %alloc_1) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<2x20xf32, "cuda">, memref<20x10xf32, "cuda">, memref<2x10xf32, "cuda"> - %8 = call @Unknown2(%2, %alloc_1) : 
(memref<10xf32, "cuda">, memref<2x10xf32, "cuda">) -> memref<2x10xf32, "cuda"> + byre.compute @MatmulOp_f32f32_f32(%7, %4, %alloc_1) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<2x20xf32, "cuda">, memref<10x20xf32, "cuda">, memref<2x10xf32, "cuda"> + %8 = call @Unknown2(%5, %alloc_1) : (memref<10xf32, "cuda">, memref<2x10xf32, "cuda">) -> memref<2x10xf32, "cuda"> return %8 : memref<2x10xf32, "cuda"> } - memref.global "private" constant @__constant_10x20xf32_cuda : memref<10x20xf32, "cuda"> = dense<"0xEDAEFD3D6B88963E1051E33DFC7732BE055CEE3C07F9413E080B9CBE9B2F4ABE5608BD3BF8E6DFBD507F46BEC61183BEACE23ABEF010903E824129BEAFB6D83C779721BE953B2B3E8B44CB3BB09383BE2456463EA8E7983ECE3D9EBC690042BEF0D34CBD5AFCAB3DCC1AF13D8E3CF43B0BF2583EC82B583D658D653C79C131BE9AEF24BD85B4B03D46DBAF3DEB4013BD26A9693EB17CC43CEDAF77BEA24E5BBE409C203EA0BE27BCA3380BBEC03C5A3E2775633D62069C3E0DF3963D259F883E7AAD743EEB5AC8BD4B210F3ECE1F303E3D4983BDA3A3F63D3D993FBE868FF1BC89B98ABE13D72E3D5012703E826D35BE725C76BECA748B3ED59EA23D63F6103E2F3527BE354722BECE329DBE39E496BE46C71CBDC849E83D8BD2ADBD2596A73DF2DA0ABED46BA03D752989BDB8624CBE971808BEB1C184BDCC23743E36B4AB3C7D51683E7ADA4B3EDA6EDD3D2DFDF1BDC9A5D5BD948E7E3ECFCE7FBE6E9E813D85DBE43B6543143D379970BEA80F16BED17C18BD05E9193E0F403B3D26EDA43B808C9ABEA6A2823D08D786BE2314AABCD8D783BE562D9ABE2B60943E88C91ABEB2DC513E136825BEBB0DBD3D8E26E7BDF4464EBDD9E68E3E5B5115BE3FCF123E1E194FBE2CACBF3DC71E153D96F24B3E53E9E9BD407141BE714361BE4D2E963DA67B8F3EB62B78BE0D424E3D5EBB7F3D056098BEC5F011BDCB26033E1656DD3C60B90DBE7EFF583DE9C118BDDDF5673E451A58BD8E0C32BE121C05BC8D148E3ED80AEA3DBE27AA3A1F534B3E97023EBD85C231BEEB7CF33BF5FF363D32DCDBBDFBFED83DB640623D66E5EA3D785D2B3E4E7470BE651C673EDC57473E601A24BECF5FFABD38AA14BE9EFC60BCE3E5983E10EE7D3E2BE0453ECE3C41BE0A3B753E4225383D967059BDC197B8BD4FA407BE5C0C17BE0F16953E9884E7BC2E6CF6BDAF01453EB6F466BE37968ABE5BC056BECBD28E3ED9AC493DACC7103D46EB73BEBB39213DFD30F0BD835CD4BD426F4FBD620472BE67FB97BEFBAE8CBDF08F553EFAB77D3EFCC7D3BC76249D3EA2C404BE937B9B3E14197E3D30212D3E262F63BD7899D1BDF72C993E7537DE3D6FD39D3E988D4B3DDB132B3E097461BD8461313C"> - memref.global "private" constant @__constant_20x20xf32_cuda : memref<20x20xf32, "cuda"> = 
dense<"0x32A148BE93869BBDDF0E313EADAE1EBD5C52443E3506E7BBF2CFB3BD4CF5A13DBC209B3D697E213D4A14543DAB303B3EB6C1FA3B3524DBBCE1C81ABE9DDF51BEBF11173EFFDA4C3E6E8A2DBEF9F5013E6D99D1BDBEED34BDA2CFA0BCBFCE46BE85B19BBC4D7659BE398863BEC7A3DD3D590C1ABE4A942DBE9EB929BE6A0D20BE941C90BD097A76BC0640443EAE5B30BE723C573EC0EE403E0F2024BE3D24D53D3C565A3E62E5843DA45CAF3C41EC6DBBE2019E3D1D3A013E42C74CBE4E3CE63D734E60BE530B01BECD1844BEEC38BE3C19E862BEFC6F0DBEE58D4F3D03A7AA3D1DCF543C756D503E81D23BBEE0D914BE614360BCC40D4F3E5A313E3E34D6FDBD9C3519BE72500C3D1B39AABD9B2983BD110010BDAFAD27BCEAFFB5BD4A82103D8E7AD13CD2693FBE3E6125BE03FD743D3B3825BEB7FCD53DCCDA58BE2B3A4CBE1E9BDF3BFF38F73CC33C35BD40D157BEDD478A3CD1E6373E436BEEBD04BE2E3EBD5F2B3D1D7813BE0EF101BE0369A1BD8FF8D93D829D3A3E85CB2EBC6A2B023EEF8C273EB6F3C2BD27F955BDAE3B4ABC265A9D3D45B35D3E112654BB8BEEB6BDBD70993D7DDE62BEDB8017BEE2AC4FBE9F8E213E592C1B3E809B3DBE7DF30B3ED463133D672DEB3DEB5A44BEFFA2B03C894A33BE8A2935BE93A6223EE10741BEBD5F18BEDE0A0EBEEDE673BD23F447BEB37F3BBE052B503E5683343E13D6283EC693A03D751BB5BDF94F243E5014C23C003410BEEBBD613ECADD55BE56C2C3BD8922313DA41CF7BD0CF8D5BD13AA29BE8FE25B3ECC6652BE20FF4CBED78823BD32E5003E392A323DE1BAA13DFB3A9BBD9FF356BD3C4FE13DFDFA51BDB60E2C3EBEE61D3EC5EFDA3DE721A0BD13E8F3BC0C34F5BD3E911D3EFD7F38BD3C54933DC201E4BC35B79BBD09665BBE2F2FC8BDBEFA293E668CEDBD0F397ABDC1675D3EC0F1E03D164A5FBDAA43353C19AC2FBE1A2E90BC3881E6BDC3EA5EBE550D21BEB6B3023E1E97B0BD500C6DBD9577BB3D085E783C9044093D916844BE162F39BE955128BEB6F210BDA4C90BBDCD44643E511B3DBE8941593E30FEC73CBEC5D73D8FF2A2BDCB4F4B3E4F6DC2BB2249463E35888ABC80299C3D70BAFEBD40F617BEA941FFBDF13E123E4C305C3E5C838CBDF44A6EBD2A505EBEF0BFA83A8BC9E03DD0DA303D64522C3E58A96FBD7211593EC3BEA6BC14EF8ABDFE5949BE43880FBE46CE16BD097B633D7775193EB7E90F3ED2D3953DC271633D6BF8053DA6DFECBD5728953DB72E1DBD2B345DBD7053E83CE305B1BDF2F3013E0DE876BD8F1098BD7232E23D4CDE0C3DAF43A93D1A34DE3DFB1247BD569B3F3E09619BBD54502A3E6BA636BC17C3273EBE032E3E34F4173C54B10A3E9452EB3D82FDD33D005014BE578D51BDF3E116BE4EDE0F3E47BF58BEC35E793D7195303ED3A822BEA331963A9CF9403E1FD32DBD4EFCBB3DB47251BDD3A570BD3C32473E614DAD3A6978643EC49A543E02AA16BEDFDE243D7ADBE9BDE1224FBE6C6826BE5EC4083E7B00783D9E975C3EEF3A1FBDC7AD1DBDCB200EBDE55E993D734D97BC4C6E543E762EE83DBF6DFBBD5F161E3CB941F3BDF40BB2BD4C2E6ABD3B33963D7B3507BE3344553ED50454BD689A233E263939BE1D60D1BB4E3959BE0BAF203C719EC8BD48F3273EFB4B103D7943B0BDA66038BEC91500BE50263FBE52262FBE207BA93D661E823DEC4B56BE58CF3A3E913828BED66E42BE57F2B2BDFFB9503EE2FD4A3E2F23A43C542B493EE2CA34BE2966513EBB4B1FBD391127BEC942A7BD196DD9BD8AC012BD6514C1BC1F9E173EA59780BD511514BD9AB8F4BD339719BEB72639BE38C9D73CB5062F3E895743BD413055BE423D12BC402404BEB4A8463E4EA9F0BD57EA383C2432453C9611263ED58B283E6313C13DC2C95FBEA30B07BE15F2E4BD611C4BBE4FEF053CC73E013E9DF7B5BDED4160BE917CFDBD7FC9523E0A8B643EC300013E966686BDD0DC1ABE05B535BECD8A573E475E283DCECB5EBDFDE8B13D367E3EBEFCD7B43D3C7D24BECE26BD3D48BF20BEF7F0D7BB421F193E85BA77BDCAE272BAB0FD9CBD487E3FBEBBF9603D8B5E24BE27A9C33D95A2063E6DD0BC3DA359173EDA01A5BD505B97BDDEE6713D5283B43DE477DBBC674E2CBED5E3593DBCCB2FBED7D1303EA9753FBE017FC43DA94A9B3DE74A0E3E2FFC273EA596B2BDD8554A3C703E19BE04CF7BBD9155863D65FA1EBE144C573E709798BBF860CA3CC63F44BE"> - memref.global "private" constant @__constant_20x10xf32_cuda : memref<20x10xf32, "cuda"> = 
dense<"0xD60ED23D6677593EEDD4253D579F523E952583BD454F62BE2C2AB53D827CF4BD7F4D41BD15896C3DDEFF3B3D5270AF3D461635BC07D5B8BDB82761BE33120CBE692C35BCDB70183EED6F4D3E271EAD3DFBC1653B77D0473E7920B43C2135C43C1780C2BBD145D43C15B1203DAAD3623C9A1808BECF144D3E7AFAC43D933CF8BDFE637BBD21BE53BEADCA233E250529BD4576FABD2B7B84BCCCB48FBC84B3D6BDAB54263EC31061BEF81913BE3E001C3E4D7A0E3EE4CE4EBE2119CFBDC6B6123EB3CA01BEB506BF3DDA0774BD37FD1E3D1EB44BBE38E409BE994A1FBE74FFDB3DBB05A93D29209A3D756B01BDEAA162BD602A03BE522F3FBE9C1DDC3D486F1EBEB99D21BEE05813BE074943BE4233E33C32BBFA3D81F31ABE747C53BEC1814A3EA77B363E551E113E65E5673D0EDAC4BDC9732E3DD45C3C3E1BCB113E9DD61DBDC754B93DC9CB88BCB5D84DBEF14E443ED54E1EBE659C343E349F62BEF5BBD73D1042FDBDBFDD4EBD306D813DE33C16BE39B00EBE946CD7BDB97CC9BD08A411BDC933103EDE725FBEF3C1553D830E3B3CBFD7253ECFBD0D3EDC6CD83DA6EA563D9238C53C1405B2BD4948D83C0A72B03DEC8AD93DD04F3A3E96216E3C0351123E034807BE0929E7BDD3E3453DCD388B3D68C034BCE7650E3E41C70D3E392C32BEEA885CBC19FFD9BD8B68F83C6FE3B5BCC302633E736E193ED8D1503EBEB64B3EE0263C3EAC4E223DB90511BE36A5443E86AA713C3698A3BCB129A93D517218BEA7BB51BECA19393E8948043E5DE08BBDA1F012BED3483E3EB3F664BEEB093DBEEF2FB7BC3243F6BC2EFA283EB41EF5BDF16A573D19E6E83DDCF4F23DA4D8C3BD4877DFBC1B13303B60330D3ECA5569BD97A090BD3A5E9C3D3565B63DE54F6BBDAC1B29BDDEB4CD3D6C5006BE61B08FBC5AD9DA3B80340EBE933AD83D6334E93CE5F18EBDCC3D73BD128D2ABD46CA34BDDDA5BABDE0C404BD533F223D3072B4BD7D7C413EB831DABD3359BBBB0A69993D6024AD3D0D0EE9BDBD5A80BC7FA6E83D400BFBBD616438BE34CBC73D4769D73D4E295EBEECBE843C838E9CBDA171FA3DDCED3EBE9FEDA6BDAD78A8BDD19702BD18B3043E39CF873C46E602BCD0CBC33D"> memref.global "private" constant @__constant_10xf32_cuda : memref<10xf32, "cuda"> = dense<[0.0670170113, 0.0825609341, -0.125343189, -0.0073415176, -0.100303039, -0.214000896, 0.114002995, 0.21737574, 0.166609675, -0.119800359]> + memref.global "private" constant @__constant_10x20xf32_cuda : memref<10x20xf32, "cuda"> = dense<"0xD60ED23DDEFF3B3DFBC1653B7AFAC43DAB54263EDA0774BD602A03BE747C53BEC754B93D306D813DBFD7253E96216E3CEA885CBCB90511BEA1F012BEDCF4F23DAC1B29BD128D2ABD6024AD3D838E9CBD6677593E5270AF3D77D0473E933CF8BDC31061BE37FD1E3D522F3FBEC1814A3EC9CB88BCE33C16BECFBD0D3E0351123E19FFD9BD36A5443ED3483E3EA4D8C3BDDEB4CD3D46CA34BD0D0EE9BDA171FA3DEDD4253D461635BC7920B43CFE637BBDF81913BE1EB44BBE9C1DDC3DA77B363EB5D84DBE39B00EBEDC6CD83D034807BE8B68F83C86AA713CB3F664BE4877DFBC6C5006BEDDA5BABDBD5A80BCDCED3EBE579F523E07D5B8BD2135C43C21BE53BE3E001C3E38E409BE486F1EBE551E113EF14E443E946CD7BDA6EA563D0929E7BD6FE3B5BC3698A3BCEB093DBE1B13303B61B08FBCE0C404BD7FA6E83D9FEDA6BD952583BDB82761BE1780C2BBADCA233E4D7A0E3E994A1FBEB99D21BE65E5673DD54E1EBEB97CC9BD9238C53CD3E3453DC302633EB129A93DEF2FB7BC60330D3E5AD9DA3B533F223D400BFBBDAD78A8BD454F62BE33120CBED145D43C250529BDE4CE4EBE74FFDB3DE05813BE0EDAC4BD659C343E08A411BD1405B2BDCD388B3D736E193E517218BE3243F6BCCA5569BD80340EBE3072B4BD616438BED19702BD2C2AB53D692C35BC15B1203D4576FABD2119CFBDBB05A93D074943BEC9732E3D349F62BEC933103E4948D83C68C034BCD8D1503EA7BB51BE2EFA283E97A090BD933AD83D7D7C413E34CBC73D18B3043E827CF4BDDB70183EAAD3623C2B7B84BCC6B6123E29209A3D4233E33CD45C3C3EF5BBD73DDE725FBE0A72B03DE7650E3EBEB64B3ECA19393EB41EF5BD3A5E9C3D6334E93CB831DABD4769D73D39CF873C7F4D41BDED6F4D3E9A1808BECCB48FBCB3CA01BE756B01BD32BBFA3D1BCB113E1042FDBDF3C1553DEC8AD93D41C70D3EE0263C3E8948043EF16A573D3565B63DE5F18EBD3359BBBB4E295EBE46E602BC15896C3D271EAD3DCF144D3E84B3D6BDB506BF3DEAA162BD81F31ABE9DD61DBDBFDD4EBD830E3B3CD04F3A3E392C32BEAC4E223D5DE08BBD19E6E83DE54F6BBDCC3D73BD0A69993DECBE843CD0CBC33D"> 
memref.global "private" constant @__constant_20xf32_0_cuda : memref<20xf32, "cuda"> = dense<[0.124238588, -0.0375917405, -0.178324029, 2.1018261E-4, -0.0708629936, 0.179958493, 0.201986402, -0.0302014686, -0.0842267424, 0.0796111747, 0.0201944318, -0.183529228, -0.133614406, -0.0192934573, 0.193412527, 0.219010666, -0.0464102961, 0.00334274326, -0.0029087835, 0.0903228372]> + memref.global "private" constant @__constant_20x20xf32_cuda : memref<20x20xf32, "cuda"> = dense<"0x32A148BE6D99D1BD3C565A3E614360BC1E9BDF3B265A9D3DBD5F18BE8FE25B3EC201E4BC085E783CA941FFBDD2D3953D6BA636BCD3A570BDBF6DFBBD50263FBE6514C1BCC2C95FBEFCD7B43DE477DBBC93869BBDBEED34BD62E5843DC40D4F3EFF38F73C45B35D3EDE0A0EBECC6652BE35B79BBD9044093DF13E123EC271633D17C3273E3C32473E5F161E3C52262FBE1F9E173EA30B07BE3C7D24BE674E2CBEDF0E313EA2CFA0BCA45CAF3C5A313E3EC33C35BD112654BBEDE673BD20FF4CBE09665BBE916844BE4C305C3E6BF8053DBE032E3E614DAD3AB941F3BD207BA93DA59780BD15F2E4BDCE26BD3DD5E3593DADAE1EBDBFCE46BE41EC6DBB34D6FDBD40D157BE8BEEB6BD23F447BED78823BD2F2FC8BD162F39BE5C838CBDA6DFECBD34F4173C6978643EF40BB2BD661E823D511514BD611C4BBE48BF20BEBCCB2FBE5C52443E85B19BBCE2019E3D9C3519BEDD478A3CBD70993DB37F3BBE32E5003EBEFA293E955128BEF44A6EBD5728953D54B10A3EC49A543E4C2E6ABDEC4B56BE9AB8F4BD4FEF053CF7F0D7BBD7D1303E3506E7BB4D7659BE1D3A013E72500C3DD1E6373E7DDE62BE052B503E392A323D668CEDBDB6F210BD2A505EBEB72E1DBD9452EB3D02AA16BE3B33963D58CF3A3E339719BEC73E013E421F193EA9753FBEF2CFB3BD398863BE42C74CBE1B39AABD436BEEBDDB8017BE5683343EE1BAA13D0F397ABDA4C90BBDF0BFA83A2B345DBD82FDD33DDFDE243D7B3507BE913828BEB72639BE9DF7B5BD85BA77BD017FC43D4CF5A13DC7A3DD3D4E3CE63D9B2983BD04BE2E3EE2AC4FBE13D6283EFB3A9BBDC1675D3ECD44643E8BC9E03D7053E83C005014BE7ADBE9BD3344553ED66E42BE38C9D73CED4160BECAE272BAA94A9B3DBC209B3D590C1ABE734E60BE110010BDBD5F2B3D9F8E213EC693A03D9FF356BDC0F1E03D511B3DBED0DA303DE305B1BD578D51BDE1224FBED50454BD57F2B2BDB5062F3E917CFDBDB0FD9CBDE74A0E3E697E213D4A942DBE530B01BEAFAD27BC1D7813BE592C1B3E751BB5BD3C4FE13D164A5FBD8941593E64522C3EF2F3013EF3E116BE6C6826BE689A233EFFB9503E895743BD7FC9523E487E3FBE2FFC273E4A14543D9EB929BECD1844BEEAFFB5BD0EF101BE809B3DBEF94F243EFDFA51BDAA43353C30FEC73C58A96FBD0DE876BD4EDE0F3E5EC4083E263939BEE2FD4A3E413055BE0A8B643EBBF9603DA596B2BDAB303B3E6A0D20BEEC38BE3C4A82103D0369A1BD7DF30B3E5014C23CB60E2C3E19AC2FBEBEC5D73D7211593E8F1098BD47BF58BE7B00783D1D60D1BB2F23A43C423D12BCC300013E8B5E24BED8554A3CB6C1FA3B941C90BD19E862BE8E7AD13C8FF8D93DD463133D003410BEBEE61D3E1A2E90BC8FF2A2BDC3BEA6BC7232E23DC35E793D9E975C3E4E3959BE542B493E402404BE966686BD27A9C33D703E19BE3524DBBC097A76BCFC6F0DBED2693FBE829D3A3E672DEB3DEBBD613EC5EFDA3D3881E6BDCB4F4B3E14EF8ABD4CDE0C3D7195303EEF3A1FBD0BAF203CE2CA34BEB4A8463ED0DC1ABE95A2063E04CF7BBDE1C81ABE0640443EE58D4F3D3E6125BE85CB2EBCEB5A44BECADD55BEE721A0BDC3EA5EBE4F6DC2BBFE5949BEAF43A93DD3A822BEC7AD1DBD719EC8BD2966513E4EA9F0BD05B535BE6DD0BC3D9155863D9DDF51BEAE5B30BE03A7AA3D03FD743D6A2B023EFFA2B03C56C2C3BD13E8F3BC550D21BE2249463E43880FBE1A34DE3DA331963ACB200EBD48F3273EBB4B1FBD57EA383CCD8A573EA359173E65FA1EBEBF11173E723C573E1DCF543C3B3825BEEF8C273E894A33BE8922313D0C34F5BDB6B3023E35888ABC46CE16BDFB1247BD9CF9403EE55E993DFB4B103D391127BE2432453C475E283DDA01A5BD144C573EFFDA4C3EC0EE403E756D503EB7FCD53DB6F3C2BD8A2935BEA41CF7BD3E911D3E1E97B0BD80299C3D097B633D569B3F3E1FD32DBD734D97BC7943B0BDC942A7BD9611263ECECB5EBD505B97BD709798BB6E8A2DBE0F2024BE81D23BBECCDA58BE27F955BD93A6223E0CF8D5BDFD7F38BD500C6DBD70BAFEBD7775193E09619BBD4EFCBB3D4C6E543EA66038BE196DD9BDD58B283EFDE8B13DDEE6713DF860CA3CF9F5013E3D24D53DE0D914BE2B3A4CBEAE3B4
ABCE10741BE13AA29BE3C54933D9577BB3D40F617BEB7E90F3E54502A3EB47251BD762EE83DC91500BE8AC012BD6313C13D367E3EBE5283B43DC63F44BE"> memref.global "private" constant @__constant_20xf32_cuda : memref<20xf32, "cuda"> = dense<[0.101879634, -0.178835288, -0.0953023583, -0.0698504745, -0.19658649, -0.297641844, -0.223349303, 0.168986112, 9.007710e-02, 0.101534814, -0.0601868108, -0.0958566219, -0.243612081, 0.198881537, -0.293788224, -0.240900397, 0.184188008, 0.210917979, 0.121171109, -0.155078679]> + memref.global "private" constant @__constant_20x10xf32_cuda : memref<20x10xf32, "cuda"> = dense<"0xEDAEFD3D2456463E409C203E725C76BE7D51683ED8D783BEA67B8F3E85C231BE4225383D620472BE6B88963EA8E7983EA0BE27BCCA748B3E7ADA4B3E562D9ABEB62B78BEEB7CF33B967059BD67FB97BE1051E33DCE3D9EBCA3380BBED59EA23DDA6EDD3D2B60943E0D424E3DF5FF363DC197B8BDFBAE8CBDFC7732BE690042BEC03C5A3E63F6103E2DFDF1BD88C91ABE5EBB7F3D32DCDBBD4FA407BEF08F553E055CEE3CF0D34CBD2775633D2F3527BEC9A5D5BDB2DC513E056098BEFBFED83D5C0C17BEFAB77D3E07F9413E5AFCAB3D62069C3E354722BE948E7E3E136825BEC5F011BDB640623D0F16953EFCC7D3BC080B9CBECC1AF13D0DF3963DCE329DBECFCE7FBEBB0DBD3DCB26033E66E5EA3D9884E7BC76249D3E9B2F4ABE8E3CF43B259F883E39E496BE6E9E813D8E26E7BD1656DD3C785D2B3E2E6CF6BDA2C404BE5608BD3B0BF2583E7AAD743E46C71CBD85DBE43BF4464EBD60B90DBE4E7470BEAF01453E937B9B3EF8E6DFBDC82B583DEB5AC8BDC849E83D6543143DD9E68E3E7EFF583D651C673EB6F466BE14197E3D507F46BE658D653C4B210F3E8BD2ADBD379970BE5B5115BEE9C118BDDC57473E37968ABE30212D3EC61183BE79C131BECE1F303E2596A73DA80F16BE3FCF123EDDF5673E601A24BE5BC056BE262F63BDACE23ABE9AEF24BD3D4983BDF2DA0ABED17C18BD1E194FBE451A58BDCF5FFABDCBD28E3E7899D1BDF010903E85B4B03DA3A3F63DD46BA03D05E9193E2CACBF3D8E0C32BE38AA14BED9AC493DF72C993E824129BE46DBAF3D3D993FBE752989BD0F403B3DC71E153D121C05BC9EFC60BCACC7103D7537DE3DAFB6D83CEB4013BD868FF1BCB8624CBE26EDA43B96F24B3E8D148E3EE3E5983E46EB73BE6FD39D3E779721BE26A9693E89B98ABE971808BE808C9ABE53E9E9BDD80AEA3D10EE7D3EBB39213D988D4B3D953B2B3EB17CC43C13D72E3DB1C184BDA6A2823D407141BEBE27AA3A2BE0453EFD30F0BDDB132B3E8B44CB3BEDAF77BE5012703ECC23743E08D786BE714361BE1F534B3ECE3C41BE835CD4BD097461BDB09383BEA24E5BBE826D35BE36B4AB3C2314AABC4D2E963D97023EBD0A3B753E426F4FBD8461313C"> } \ No newline at end of file diff --git a/compiler/test/E2E/MLPInference/9a_byre_host.mlir b/compiler/test/E2E/MLPInference/9a_byre_host.mlir index 68f78a4cb..2e66696ba 100644 --- a/compiler/test/E2E/MLPInference/9a_byre_host.mlir +++ b/compiler/test/E2E/MLPInference/9a_byre_host.mlir @@ -5,126 +5,65 @@ module attributes {byre.container_module, gpu.container_module, torch.debug_module_name = "GraphModule"} { gpu.module @unified { gpu.func @Unknown2(%arg0: memref<10xf32>, %arg1: memref<2x10xf32>, %arg2: memref<2x10xf32>) kernel { - %c0 = arith.constant 0 : index %c20 = arith.constant 20 : index %c10 = arith.constant 10 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c20 : index - scf.if %5 { - %6 = arith.remsi %4, %c10 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c10 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c10 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg1[%15, %9] : memref<2x10xf32> - %17 = memref.load %arg0[%9] : memref<10xf32> - %18 = arith.addf 
%16, %17 : f32 - memref.store %18, %arg2[%15, %9] : memref<2x10xf32> - } - gpu.return - } - gpu.func @Unknown1(%arg0: memref<20xf32>, %arg1: memref<2x20xf32>, %arg2: memref<2x20xf32>) kernel { - %cst = arith.constant 0.000000e+00 : f32 - %c0 = arith.constant 0 : index - %c40 = arith.constant 40 : index - %c20 = arith.constant 20 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c40 : index - scf.if %5 { - %6 = arith.remsi %4, %c20 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c20 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c20 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg1[%15, %9] : memref<2x20xf32> - %17 = memref.load %arg0[%9] : memref<20xf32> - %18 = arith.addf %16, %17 : f32 - %19 = arith.maxnumf %18, %cst : f32 - memref.store %19, %arg2[%15, %9] : memref<2x20xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c20 step %6 { + %7 = arith.remsi %arg3, %c10 : index + %8 = arith.divsi %arg3, %c10 : index + %9 = memref.load %arg0[%7] : memref<10xf32> + %10 = memref.load %arg1[%8, %7] : memref<2x10xf32> + %11 = arith.addf %10, %9 : f32 + memref.store %11, %arg2[%8, %7] : memref<2x10xf32> } gpu.return } gpu.func @Unknown0(%arg0: memref<20xf32>, %arg1: memref<2x20xf32>, %arg2: memref<2x20xf32>) kernel { - %cst = arith.constant 0.000000e+00 : f32 - %c0 = arith.constant 0 : index %c40 = arith.constant 40 : index + %cst = arith.constant 0.000000e+00 : f32 %c20 = arith.constant 20 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c40 : index - scf.if %5 { - %6 = arith.remsi %4, %c20 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c20 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c20 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg1[%15, %9] : memref<2x20xf32> - %17 = memref.load %arg0[%9] : memref<20xf32> - %18 = arith.addf %16, %17 : f32 - %19 = arith.maxnumf %18, %cst : f32 - memref.store %19, %arg2[%15, %9] : memref<2x20xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c40 step %6 { + %7 = arith.remsi %arg3, %c20 : index + %8 = arith.divsi %arg3, %c20 : index + %9 = memref.load %arg0[%7] : memref<20xf32> + %10 = memref.load %arg1[%8, %7] : memref<2x20xf32> + %11 = arith.addf %10, %9 : f32 + %12 = arith.maximumf %11, %cst : f32 + memref.store %12, %arg2[%8, %7] : memref<2x20xf32> } gpu.return } } - func.func @forward(%arg0: memref<2x10xf32, "cuda"> {byre.argname = "Input0", byre.argtype = 1 : i32}, %arg1: memref<2x10xf32, "cuda"> {byre.argname = "Output0", byre.argtype = 2 : i32}) attributes {byre.entry_point} { - %alloc = memref.alloc() : memref<320xi8, "cuda"> - %alloc_0 = memref.alloc() : memref<20xf32, "cuda"> - byre.compute @FillOp(%alloc_0) {memory_effects = [2 : i32], value = dense<[0.101879634, -0.178835288, -0.0953023583, -0.0698504745, -0.19658649, -0.297641844, 
-0.223349303, 0.168986112, 9.007710e-02, 0.101534814, -0.0601868108, -0.0958566219, -0.243612081, 0.198881537, -0.293788224, -0.240900397, 0.184188008, 0.210917979, 0.121171109, -0.155078679]> : tensor<20xf32>} : memref<20xf32, "cuda"> - %alloc_1 = memref.alloc() : memref<20xf32, "cuda"> - byre.compute @FillOp(%alloc_1) {memory_effects = [2 : i32], value = dense<[0.124238588, -0.0375917405, -0.178324029, 2.1018261E-4, -0.0708629936, 0.179958493, 0.201986402, -0.0302014686, -0.0842267424, 0.0796111747, 0.0201944318, -0.183529228, -0.133614406, -0.0192934573, 0.193412527, 0.219010666, -0.0464102961, 0.00334274326, -0.0029087835, 0.0903228372]> : tensor<20xf32>} : memref<20xf32, "cuda"> - %alloc_2 = memref.alloc() : memref<10xf32, "cuda"> - byre.compute @FillOp(%alloc_2) {memory_effects = [2 : i32], value = dense<[0.0670170113, 0.0825609341, -0.125343189, -0.0073415176, -0.100303039, -0.214000896, 0.114002995, 0.21737574, 0.166609675, -0.119800359]> : tensor<10xf32>} : memref<10xf32, "cuda"> - %alloc_3 = memref.alloc() : memref<20x10xf32, "cuda"> - byre.compute @FillOp(%alloc_3) {memory_effects = [2 : i32], value = dense<"0xD60ED23D6677593EEDD4253D579F523E952583BD454F62BE2C2AB53D827CF4BD7F4D41BD15896C3DDEFF3B3D5270AF3D461635BC07D5B8BDB82761BE33120CBE692C35BCDB70183EED6F4D3E271EAD3DFBC1653B77D0473E7920B43C2135C43C1780C2BBD145D43C15B1203DAAD3623C9A1808BECF144D3E7AFAC43D933CF8BDFE637BBD21BE53BEADCA233E250529BD4576FABD2B7B84BCCCB48FBC84B3D6BDAB54263EC31061BEF81913BE3E001C3E4D7A0E3EE4CE4EBE2119CFBDC6B6123EB3CA01BEB506BF3DDA0774BD37FD1E3D1EB44BBE38E409BE994A1FBE74FFDB3DBB05A93D29209A3D756B01BDEAA162BD602A03BE522F3FBE9C1DDC3D486F1EBEB99D21BEE05813BE074943BE4233E33C32BBFA3D81F31ABE747C53BEC1814A3EA77B363E551E113E65E5673D0EDAC4BDC9732E3DD45C3C3E1BCB113E9DD61DBDC754B93DC9CB88BCB5D84DBEF14E443ED54E1EBE659C343E349F62BEF5BBD73D1042FDBDBFDD4EBD306D813DE33C16BE39B00EBE946CD7BDB97CC9BD08A411BDC933103EDE725FBEF3C1553D830E3B3CBFD7253ECFBD0D3EDC6CD83DA6EA563D9238C53C1405B2BD4948D83C0A72B03DEC8AD93DD04F3A3E96216E3C0351123E034807BE0929E7BDD3E3453DCD388B3D68C034BCE7650E3E41C70D3E392C32BEEA885CBC19FFD9BD8B68F83C6FE3B5BCC302633E736E193ED8D1503EBEB64B3EE0263C3EAC4E223DB90511BE36A5443E86AA713C3698A3BCB129A93D517218BEA7BB51BECA19393E8948043E5DE08BBDA1F012BED3483E3EB3F664BEEB093DBEEF2FB7BC3243F6BC2EFA283EB41EF5BDF16A573D19E6E83DDCF4F23DA4D8C3BD4877DFBC1B13303B60330D3ECA5569BD97A090BD3A5E9C3D3565B63DE54F6BBDAC1B29BDDEB4CD3D6C5006BE61B08FBC5AD9DA3B80340EBE933AD83D6334E93CE5F18EBDCC3D73BD128D2ABD46CA34BDDDA5BABDE0C404BD533F223D3072B4BD7D7C413EB831DABD3359BBBB0A69993D6024AD3D0D0EE9BDBD5A80BC7FA6E83D400BFBBD616438BE34CBC73D4769D73D4E295EBEECBE843C838E9CBDA171FA3DDCED3EBE9FEDA6BDAD78A8BDD19702BD18B3043E39CF873C46E602BCD0CBC33D"> : tensor<20x10xf32>} : memref<20x10xf32, "cuda"> - %alloc_4 = memref.alloc() : memref<20x20xf32, "cuda"> - byre.compute @FillOp(%alloc_4) {memory_effects = [2 : i32], value = 
dense<"0x32A148BE93869BBDDF0E313EADAE1EBD5C52443E3506E7BBF2CFB3BD4CF5A13DBC209B3D697E213D4A14543DAB303B3EB6C1FA3B3524DBBCE1C81ABE9DDF51BEBF11173EFFDA4C3E6E8A2DBEF9F5013E6D99D1BDBEED34BDA2CFA0BCBFCE46BE85B19BBC4D7659BE398863BEC7A3DD3D590C1ABE4A942DBE9EB929BE6A0D20BE941C90BD097A76BC0640443EAE5B30BE723C573EC0EE403E0F2024BE3D24D53D3C565A3E62E5843DA45CAF3C41EC6DBBE2019E3D1D3A013E42C74CBE4E3CE63D734E60BE530B01BECD1844BEEC38BE3C19E862BEFC6F0DBEE58D4F3D03A7AA3D1DCF543C756D503E81D23BBEE0D914BE614360BCC40D4F3E5A313E3E34D6FDBD9C3519BE72500C3D1B39AABD9B2983BD110010BDAFAD27BCEAFFB5BD4A82103D8E7AD13CD2693FBE3E6125BE03FD743D3B3825BEB7FCD53DCCDA58BE2B3A4CBE1E9BDF3BFF38F73CC33C35BD40D157BEDD478A3CD1E6373E436BEEBD04BE2E3EBD5F2B3D1D7813BE0EF101BE0369A1BD8FF8D93D829D3A3E85CB2EBC6A2B023EEF8C273EB6F3C2BD27F955BDAE3B4ABC265A9D3D45B35D3E112654BB8BEEB6BDBD70993D7DDE62BEDB8017BEE2AC4FBE9F8E213E592C1B3E809B3DBE7DF30B3ED463133D672DEB3DEB5A44BEFFA2B03C894A33BE8A2935BE93A6223EE10741BEBD5F18BEDE0A0EBEEDE673BD23F447BEB37F3BBE052B503E5683343E13D6283EC693A03D751BB5BDF94F243E5014C23C003410BEEBBD613ECADD55BE56C2C3BD8922313DA41CF7BD0CF8D5BD13AA29BE8FE25B3ECC6652BE20FF4CBED78823BD32E5003E392A323DE1BAA13DFB3A9BBD9FF356BD3C4FE13DFDFA51BDB60E2C3EBEE61D3EC5EFDA3DE721A0BD13E8F3BC0C34F5BD3E911D3EFD7F38BD3C54933DC201E4BC35B79BBD09665BBE2F2FC8BDBEFA293E668CEDBD0F397ABDC1675D3EC0F1E03D164A5FBDAA43353C19AC2FBE1A2E90BC3881E6BDC3EA5EBE550D21BEB6B3023E1E97B0BD500C6DBD9577BB3D085E783C9044093D916844BE162F39BE955128BEB6F210BDA4C90BBDCD44643E511B3DBE8941593E30FEC73CBEC5D73D8FF2A2BDCB4F4B3E4F6DC2BB2249463E35888ABC80299C3D70BAFEBD40F617BEA941FFBDF13E123E4C305C3E5C838CBDF44A6EBD2A505EBEF0BFA83A8BC9E03DD0DA303D64522C3E58A96FBD7211593EC3BEA6BC14EF8ABDFE5949BE43880FBE46CE16BD097B633D7775193EB7E90F3ED2D3953DC271633D6BF8053DA6DFECBD5728953DB72E1DBD2B345DBD7053E83CE305B1BDF2F3013E0DE876BD8F1098BD7232E23D4CDE0C3DAF43A93D1A34DE3DFB1247BD569B3F3E09619BBD54502A3E6BA636BC17C3273EBE032E3E34F4173C54B10A3E9452EB3D82FDD33D005014BE578D51BDF3E116BE4EDE0F3E47BF58BEC35E793D7195303ED3A822BEA331963A9CF9403E1FD32DBD4EFCBB3DB47251BDD3A570BD3C32473E614DAD3A6978643EC49A543E02AA16BEDFDE243D7ADBE9BDE1224FBE6C6826BE5EC4083E7B00783D9E975C3EEF3A1FBDC7AD1DBDCB200EBDE55E993D734D97BC4C6E543E762EE83DBF6DFBBD5F161E3CB941F3BDF40BB2BD4C2E6ABD3B33963D7B3507BE3344553ED50454BD689A233E263939BE1D60D1BB4E3959BE0BAF203C719EC8BD48F3273EFB4B103D7943B0BDA66038BEC91500BE50263FBE52262FBE207BA93D661E823DEC4B56BE58CF3A3E913828BED66E42BE57F2B2BDFFB9503EE2FD4A3E2F23A43C542B493EE2CA34BE2966513EBB4B1FBD391127BEC942A7BD196DD9BD8AC012BD6514C1BC1F9E173EA59780BD511514BD9AB8F4BD339719BEB72639BE38C9D73CB5062F3E895743BD413055BE423D12BC402404BEB4A8463E4EA9F0BD57EA383C2432453C9611263ED58B283E6313C13DC2C95FBEA30B07BE15F2E4BD611C4BBE4FEF053CC73E013E9DF7B5BDED4160BE917CFDBD7FC9523E0A8B643EC300013E966686BDD0DC1ABE05B535BECD8A573E475E283DCECB5EBDFDE8B13D367E3EBEFCD7B43D3C7D24BECE26BD3D48BF20BEF7F0D7BB421F193E85BA77BDCAE272BAB0FD9CBD487E3FBEBBF9603D8B5E24BE27A9C33D95A2063E6DD0BC3DA359173EDA01A5BD505B97BDDEE6713D5283B43DE477DBBC674E2CBED5E3593DBCCB2FBED7D1303EA9753FBE017FC43DA94A9B3DE74A0E3E2FFC273EA596B2BDD8554A3C703E19BE04CF7BBD9155863D65FA1EBE144C573E709798BBF860CA3CC63F44BE"> : tensor<20x20xf32>} : memref<20x20xf32, "cuda"> - %alloc_5 = memref.alloc() : memref<10x20xf32, "cuda"> - byre.compute @FillOp(%alloc_5) {memory_effects = [2 : i32], value = 
dense<"0xEDAEFD3D6B88963E1051E33DFC7732BE055CEE3C07F9413E080B9CBE9B2F4ABE5608BD3BF8E6DFBD507F46BEC61183BEACE23ABEF010903E824129BEAFB6D83C779721BE953B2B3E8B44CB3BB09383BE2456463EA8E7983ECE3D9EBC690042BEF0D34CBD5AFCAB3DCC1AF13D8E3CF43B0BF2583EC82B583D658D653C79C131BE9AEF24BD85B4B03D46DBAF3DEB4013BD26A9693EB17CC43CEDAF77BEA24E5BBE409C203EA0BE27BCA3380BBEC03C5A3E2775633D62069C3E0DF3963D259F883E7AAD743EEB5AC8BD4B210F3ECE1F303E3D4983BDA3A3F63D3D993FBE868FF1BC89B98ABE13D72E3D5012703E826D35BE725C76BECA748B3ED59EA23D63F6103E2F3527BE354722BECE329DBE39E496BE46C71CBDC849E83D8BD2ADBD2596A73DF2DA0ABED46BA03D752989BDB8624CBE971808BEB1C184BDCC23743E36B4AB3C7D51683E7ADA4B3EDA6EDD3D2DFDF1BDC9A5D5BD948E7E3ECFCE7FBE6E9E813D85DBE43B6543143D379970BEA80F16BED17C18BD05E9193E0F403B3D26EDA43B808C9ABEA6A2823D08D786BE2314AABCD8D783BE562D9ABE2B60943E88C91ABEB2DC513E136825BEBB0DBD3D8E26E7BDF4464EBDD9E68E3E5B5115BE3FCF123E1E194FBE2CACBF3DC71E153D96F24B3E53E9E9BD407141BE714361BE4D2E963DA67B8F3EB62B78BE0D424E3D5EBB7F3D056098BEC5F011BDCB26033E1656DD3C60B90DBE7EFF583DE9C118BDDDF5673E451A58BD8E0C32BE121C05BC8D148E3ED80AEA3DBE27AA3A1F534B3E97023EBD85C231BEEB7CF33BF5FF363D32DCDBBDFBFED83DB640623D66E5EA3D785D2B3E4E7470BE651C673EDC57473E601A24BECF5FFABD38AA14BE9EFC60BCE3E5983E10EE7D3E2BE0453ECE3C41BE0A3B753E4225383D967059BDC197B8BD4FA407BE5C0C17BE0F16953E9884E7BC2E6CF6BDAF01453EB6F466BE37968ABE5BC056BECBD28E3ED9AC493DACC7103D46EB73BEBB39213DFD30F0BD835CD4BD426F4FBD620472BE67FB97BEFBAE8CBDF08F553EFAB77D3EFCC7D3BC76249D3EA2C404BE937B9B3E14197E3D30212D3E262F63BD7899D1BDF72C993E7537DE3D6FD39D3E988D4B3DDB132B3E097461BD8461313C"> : tensor<10x20xf32>} : memref<10x20xf32, "cuda"> - %0 = "byre.alias"(%alloc) {offset = 0 : i64} : (memref<320xi8, "cuda">) -> memref<2x20xf32, "cuda"> - byre.compute @MatmulOp_f32f32_f32(%arg0, %alloc_5, %0) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<2x10xf32, "cuda">, memref<10x20xf32, "cuda">, memref<2x20xf32, "cuda"> - %1 = "byre.alias"(%alloc) {offset = 160 : i64} : (memref<320xi8, "cuda">) -> memref<2x20xf32, "cuda"> - byre.compute @PTXOp(%alloc_0, %0, %1) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown0", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<20xf32, "cuda">, memref<2x20xf32, "cuda">, memref<2x20xf32, "cuda"> - byre.compute @MatmulOp_f32f32_f32(%1, %alloc_4, %0) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<2x20xf32, "cuda">, memref<20x20xf32, "cuda">, memref<2x20xf32, "cuda"> - byre.compute @PTXOp(%alloc_1, %0, %1) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown1", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<20xf32, "cuda">, memref<2x20xf32, "cuda">, memref<2x20xf32, "cuda"> - %2 = "byre.alias"(%alloc) {offset = 0 : i64} : (memref<320xi8, "cuda">) -> memref<2x10xf32, "cuda"> - byre.compute @MatmulOp_f32f32_f32(%1, %alloc_3, %2) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<2x20xf32, "cuda">, memref<20x10xf32, "cuda">, memref<2x10xf32, "cuda"> - byre.compute @PTXOp(%alloc_2, %2, %arg1) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown2", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : 
memref<10xf32, "cuda">, memref<2x10xf32, "cuda">, memref<2x10xf32, "cuda"> + func.func @forward(%arg0: memref<20x10xf32, "cuda"> {byre.argname = "Weight0", byre.argtype = 4 : i32, byre.weight_value = dense<"0xEDAEFD3D2456463E409C203E725C76BE7D51683ED8D783BEA67B8F3E85C231BE4225383D620472BE6B88963EA8E7983EA0BE27BCCA748B3E7ADA4B3E562D9ABEB62B78BEEB7CF33B967059BD67FB97BE1051E33DCE3D9EBCA3380BBED59EA23DDA6EDD3D2B60943E0D424E3DF5FF363DC197B8BDFBAE8CBDFC7732BE690042BEC03C5A3E63F6103E2DFDF1BD88C91ABE5EBB7F3D32DCDBBD4FA407BEF08F553E055CEE3CF0D34CBD2775633D2F3527BEC9A5D5BDB2DC513E056098BEFBFED83D5C0C17BEFAB77D3E07F9413E5AFCAB3D62069C3E354722BE948E7E3E136825BEC5F011BDB640623D0F16953EFCC7D3BC080B9CBECC1AF13D0DF3963DCE329DBECFCE7FBEBB0DBD3DCB26033E66E5EA3D9884E7BC76249D3E9B2F4ABE8E3CF43B259F883E39E496BE6E9E813D8E26E7BD1656DD3C785D2B3E2E6CF6BDA2C404BE5608BD3B0BF2583E7AAD743E46C71CBD85DBE43BF4464EBD60B90DBE4E7470BEAF01453E937B9B3EF8E6DFBDC82B583DEB5AC8BDC849E83D6543143DD9E68E3E7EFF583D651C673EB6F466BE14197E3D507F46BE658D653C4B210F3E8BD2ADBD379970BE5B5115BEE9C118BDDC57473E37968ABE30212D3EC61183BE79C131BECE1F303E2596A73DA80F16BE3FCF123EDDF5673E601A24BE5BC056BE262F63BDACE23ABE9AEF24BD3D4983BDF2DA0ABED17C18BD1E194FBE451A58BDCF5FFABDCBD28E3E7899D1BDF010903E85B4B03DA3A3F63DD46BA03D05E9193E2CACBF3D8E0C32BE38AA14BED9AC493DF72C993E824129BE46DBAF3D3D993FBE752989BD0F403B3DC71E153D121C05BC9EFC60BCACC7103D7537DE3DAFB6D83CEB4013BD868FF1BCB8624CBE26EDA43B96F24B3E8D148E3EE3E5983E46EB73BE6FD39D3E779721BE26A9693E89B98ABE971808BE808C9ABE53E9E9BDD80AEA3D10EE7D3EBB39213D988D4B3D953B2B3EB17CC43C13D72E3DB1C184BDA6A2823D407141BEBE27AA3A2BE0453EFD30F0BDDB132B3E8B44CB3BEDAF77BE5012703ECC23743E08D786BE714361BE1F534B3ECE3C41BE835CD4BD097461BDB09383BEA24E5BBE826D35BE36B4AB3C2314AABC4D2E963D97023EBD0A3B753E426F4FBD8461313C"> : tensor<20x10xf32>}, %arg1: memref<20xf32, "cuda"> {byre.argname = "Weight1", byre.argtype = 4 : i32, byre.weight_value = dense<[0.101879634, -0.178835288, -0.0953023583, -0.0698504745, -0.19658649, -0.297641844, -0.223349303, 0.168986112, 9.007710e-02, 0.101534814, -0.0601868108, -0.0958566219, -0.243612081, 0.198881537, -0.293788224, -0.240900397, 0.184188008, 0.210917979, 0.121171109, -0.155078679]> : tensor<20xf32>}, %arg2: memref<20x20xf32, "cuda"> {byre.argname = "Weight2", byre.argtype = 4 : i32, byre.weight_value = 
dense<"0x32A148BE6D99D1BD3C565A3E614360BC1E9BDF3B265A9D3DBD5F18BE8FE25B3EC201E4BC085E783CA941FFBDD2D3953D6BA636BCD3A570BDBF6DFBBD50263FBE6514C1BCC2C95FBEFCD7B43DE477DBBC93869BBDBEED34BD62E5843DC40D4F3EFF38F73C45B35D3EDE0A0EBECC6652BE35B79BBD9044093DF13E123EC271633D17C3273E3C32473E5F161E3C52262FBE1F9E173EA30B07BE3C7D24BE674E2CBEDF0E313EA2CFA0BCA45CAF3C5A313E3EC33C35BD112654BBEDE673BD20FF4CBE09665BBE916844BE4C305C3E6BF8053DBE032E3E614DAD3AB941F3BD207BA93DA59780BD15F2E4BDCE26BD3DD5E3593DADAE1EBDBFCE46BE41EC6DBB34D6FDBD40D157BE8BEEB6BD23F447BED78823BD2F2FC8BD162F39BE5C838CBDA6DFECBD34F4173C6978643EF40BB2BD661E823D511514BD611C4BBE48BF20BEBCCB2FBE5C52443E85B19BBCE2019E3D9C3519BEDD478A3CBD70993DB37F3BBE32E5003EBEFA293E955128BEF44A6EBD5728953D54B10A3EC49A543E4C2E6ABDEC4B56BE9AB8F4BD4FEF053CF7F0D7BBD7D1303E3506E7BB4D7659BE1D3A013E72500C3DD1E6373E7DDE62BE052B503E392A323D668CEDBDB6F210BD2A505EBEB72E1DBD9452EB3D02AA16BE3B33963D58CF3A3E339719BEC73E013E421F193EA9753FBEF2CFB3BD398863BE42C74CBE1B39AABD436BEEBDDB8017BE5683343EE1BAA13D0F397ABDA4C90BBDF0BFA83A2B345DBD82FDD33DDFDE243D7B3507BE913828BEB72639BE9DF7B5BD85BA77BD017FC43D4CF5A13DC7A3DD3D4E3CE63D9B2983BD04BE2E3EE2AC4FBE13D6283EFB3A9BBDC1675D3ECD44643E8BC9E03D7053E83C005014BE7ADBE9BD3344553ED66E42BE38C9D73CED4160BECAE272BAA94A9B3DBC209B3D590C1ABE734E60BE110010BDBD5F2B3D9F8E213EC693A03D9FF356BDC0F1E03D511B3DBED0DA303DE305B1BD578D51BDE1224FBED50454BD57F2B2BDB5062F3E917CFDBDB0FD9CBDE74A0E3E697E213D4A942DBE530B01BEAFAD27BC1D7813BE592C1B3E751BB5BD3C4FE13D164A5FBD8941593E64522C3EF2F3013EF3E116BE6C6826BE689A233EFFB9503E895743BD7FC9523E487E3FBE2FFC273E4A14543D9EB929BECD1844BEEAFFB5BD0EF101BE809B3DBEF94F243EFDFA51BDAA43353C30FEC73C58A96FBD0DE876BD4EDE0F3E5EC4083E263939BEE2FD4A3E413055BE0A8B643EBBF9603DA596B2BDAB303B3E6A0D20BEEC38BE3C4A82103D0369A1BD7DF30B3E5014C23CB60E2C3E19AC2FBEBEC5D73D7211593E8F1098BD47BF58BE7B00783D1D60D1BB2F23A43C423D12BCC300013E8B5E24BED8554A3CB6C1FA3B941C90BD19E862BE8E7AD13C8FF8D93DD463133D003410BEBEE61D3E1A2E90BC8FF2A2BDC3BEA6BC7232E23DC35E793D9E975C3E4E3959BE542B493E402404BE966686BD27A9C33D703E19BE3524DBBC097A76BCFC6F0DBED2693FBE829D3A3E672DEB3DEBBD613EC5EFDA3D3881E6BDCB4F4B3E14EF8ABD4CDE0C3D7195303EEF3A1FBD0BAF203CE2CA34BEB4A8463ED0DC1ABE95A2063E04CF7BBDE1C81ABE0640443EE58D4F3D3E6125BE85CB2EBCEB5A44BECADD55BEE721A0BDC3EA5EBE4F6DC2BBFE5949BEAF43A93DD3A822BEC7AD1DBD719EC8BD2966513E4EA9F0BD05B535BE6DD0BC3D9155863D9DDF51BEAE5B30BE03A7AA3D03FD743D6A2B023EFFA2B03C56C2C3BD13E8F3BC550D21BE2249463E43880FBE1A34DE3DA331963ACB200EBD48F3273EBB4B1FBD57EA383CCD8A573EA359173E65FA1EBEBF11173E723C573E1DCF543C3B3825BEEF8C273E894A33BE8922313D0C34F5BDB6B3023E35888ABC46CE16BDFB1247BD9CF9403EE55E993DFB4B103D391127BE2432453C475E283DDA01A5BD144C573EFFDA4C3EC0EE403E756D503EB7FCD53DB6F3C2BD8A2935BEA41CF7BD3E911D3E1E97B0BD80299C3D097B633D569B3F3E1FD32DBD734D97BC7943B0BDC942A7BD9611263ECECB5EBD505B97BD709798BB6E8A2DBE0F2024BE81D23BBECCDA58BE27F955BD93A6223E0CF8D5BDFD7F38BD500C6DBD70BAFEBD7775193E09619BBD4EFCBB3D4C6E543EA66038BE196DD9BDD58B283EFDE8B13DDEE6713DF860CA3CF9F5013E3D24D53DE0D914BE2B3A4CBEAE3B4ABCE10741BE13AA29BE3C54933D9577BB3D40F617BEB7E90F3E54502A3EB47251BD762EE83DC91500BE8AC012BD6313C13D367E3EBE5283B43DC63F44BE"> : tensor<20x20xf32>}, %arg3: memref<20xf32, "cuda"> {byre.argname = "Weight3", byre.argtype = 4 : i32, byre.weight_value = dense<[0.124238588, -0.0375917405, -0.178324029, 2.1018261E-4, -0.0708629936, 0.179958493, 0.201986402, -0.0302014686, -0.0842267424, 0.0796111747, 0.0201944318, -0.183529228, -0.133614406, -0.0192934573, 0.193412527, 
0.219010666, -0.0464102961, 0.00334274326, -0.0029087835, 0.0903228372]> : tensor<20xf32>}, %arg4: memref<10x20xf32, "cuda"> {byre.argname = "Weight4", byre.argtype = 4 : i32, byre.weight_value = dense<"0xD60ED23DDEFF3B3DFBC1653B7AFAC43DAB54263EDA0774BD602A03BE747C53BEC754B93D306D813DBFD7253E96216E3CEA885CBCB90511BEA1F012BEDCF4F23DAC1B29BD128D2ABD6024AD3D838E9CBD6677593E5270AF3D77D0473E933CF8BDC31061BE37FD1E3D522F3FBEC1814A3EC9CB88BCE33C16BECFBD0D3E0351123E19FFD9BD36A5443ED3483E3EA4D8C3BDDEB4CD3D46CA34BD0D0EE9BDA171FA3DEDD4253D461635BC7920B43CFE637BBDF81913BE1EB44BBE9C1DDC3DA77B363EB5D84DBE39B00EBEDC6CD83D034807BE8B68F83C86AA713CB3F664BE4877DFBC6C5006BEDDA5BABDBD5A80BCDCED3EBE579F523E07D5B8BD2135C43C21BE53BE3E001C3E38E409BE486F1EBE551E113EF14E443E946CD7BDA6EA563D0929E7BD6FE3B5BC3698A3BCEB093DBE1B13303B61B08FBCE0C404BD7FA6E83D9FEDA6BD952583BDB82761BE1780C2BBADCA233E4D7A0E3E994A1FBEB99D21BE65E5673DD54E1EBEB97CC9BD9238C53CD3E3453DC302633EB129A93DEF2FB7BC60330D3E5AD9DA3B533F223D400BFBBDAD78A8BD454F62BE33120CBED145D43C250529BDE4CE4EBE74FFDB3DE05813BE0EDAC4BD659C343E08A411BD1405B2BDCD388B3D736E193E517218BE3243F6BCCA5569BD80340EBE3072B4BD616438BED19702BD2C2AB53D692C35BC15B1203D4576FABD2119CFBDBB05A93D074943BEC9732E3D349F62BEC933103E4948D83C68C034BCD8D1503EA7BB51BE2EFA283E97A090BD933AD83D7D7C413E34CBC73D18B3043E827CF4BDDB70183EAAD3623C2B7B84BCC6B6123E29209A3D4233E33CD45C3C3EF5BBD73DDE725FBE0A72B03DE7650E3EBEB64B3ECA19393EB41EF5BD3A5E9C3D6334E93CB831DABD4769D73D39CF873C7F4D41BDED6F4D3E9A1808BECCB48FBCB3CA01BE756B01BD32BBFA3D1BCB113E1042FDBDF3C1553DEC8AD93D41C70D3EE0263C3E8948043EF16A573D3565B63DE5F18EBD3359BBBB4E295EBE46E602BC15896C3D271EAD3DCF144D3E84B3D6BDB506BF3DEAA162BD81F31ABE9DD61DBDBFDD4EBD830E3B3CD04F3A3E392C32BEAC4E223D5DE08BBD19E6E83DE54F6BBDCC3D73BD0A69993DECBE843CD0CBC33D"> : tensor<10x20xf32>}, %arg5: memref<10xf32, "cuda"> {byre.argname = "Weight5", byre.argtype = 4 : i32, byre.weight_value = dense<[0.0670170113, 0.0825609341, -0.125343189, -0.0073415176, -0.100303039, -0.214000896, 0.114002995, 0.21737574, 0.166609675, -0.119800359]> : tensor<10xf32>}, %arg6: memref<2x10xf32, "cuda"> {byre.argname = "Input0", byre.argtype = 1 : i32}, %arg7: memref<2x10xf32, "cuda"> {byre.argname = "Output0", byre.argtype = 2 : i32}) attributes {byre.entry_point} { + %alloc = memref.alloc() : memref<512xi8, "cuda"> + %0 = "byre.alias"(%alloc) <{offset = 0 : i64}> : (memref<512xi8, "cuda">) -> memref<2x20xf32, "cuda"> + byre.compute @MatmulOp_f32f32_f32(%arg6, %arg0, %0) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<2x10xf32, "cuda">, memref<20x10xf32, "cuda">, memref<2x20xf32, "cuda"> + %1 = "byre.alias"(%alloc) <{offset = 256 : i64}> : (memref<512xi8, "cuda">) -> memref<2x20xf32, "cuda"> + byre.compute @PTXOp(%arg1, %0, %1) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown0", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<20xf32, "cuda">, memref<2x20xf32, "cuda">, memref<2x20xf32, "cuda"> + byre.compute @MatmulOp_f32f32_f32(%1, %arg2, %0) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<2x20xf32, "cuda">, memref<20x20xf32, "cuda">, memref<2x20xf32, "cuda"> + byre.compute @PTXOp(%arg3, %0, %1) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown0", memory_effects = [1 : i32, 1 : 
i32, 2 : i32]} : memref<20xf32, "cuda">, memref<2x20xf32, "cuda">, memref<2x20xf32, "cuda"> + %2 = "byre.alias"(%alloc) <{offset = 0 : i64}> : (memref<512xi8, "cuda">) -> memref<2x10xf32, "cuda"> + byre.compute @MatmulOp_f32f32_f32(%1, %arg4, %2) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<2x20xf32, "cuda">, memref<10x20xf32, "cuda">, memref<2x10xf32, "cuda"> + byre.compute @PTXOp(%arg5, %2, %arg7) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown2", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<10xf32, "cuda">, memref<2x10xf32, "cuda">, memref<2x10xf32, "cuda"> return } - memref.global "private" constant @__constant_10x20xf32_cuda : memref<10x20xf32, "cuda"> = dense<"0xEDAEFD3D6B88963E1051E33DFC7732BE055CEE3C07F9413E080B9CBE9B2F4ABE5608BD3BF8E6DFBD507F46BEC61183BEACE23ABEF010903E824129BEAFB6D83C779721BE953B2B3E8B44CB3BB09383BE2456463EA8E7983ECE3D9EBC690042BEF0D34CBD5AFCAB3DCC1AF13D8E3CF43B0BF2583EC82B583D658D653C79C131BE9AEF24BD85B4B03D46DBAF3DEB4013BD26A9693EB17CC43CEDAF77BEA24E5BBE409C203EA0BE27BCA3380BBEC03C5A3E2775633D62069C3E0DF3963D259F883E7AAD743EEB5AC8BD4B210F3ECE1F303E3D4983BDA3A3F63D3D993FBE868FF1BC89B98ABE13D72E3D5012703E826D35BE725C76BECA748B3ED59EA23D63F6103E2F3527BE354722BECE329DBE39E496BE46C71CBDC849E83D8BD2ADBD2596A73DF2DA0ABED46BA03D752989BDB8624CBE971808BEB1C184BDCC23743E36B4AB3C7D51683E7ADA4B3EDA6EDD3D2DFDF1BDC9A5D5BD948E7E3ECFCE7FBE6E9E813D85DBE43B6543143D379970BEA80F16BED17C18BD05E9193E0F403B3D26EDA43B808C9ABEA6A2823D08D786BE2314AABCD8D783BE562D9ABE2B60943E88C91ABEB2DC513E136825BEBB0DBD3D8E26E7BDF4464EBDD9E68E3E5B5115BE3FCF123E1E194FBE2CACBF3DC71E153D96F24B3E53E9E9BD407141BE714361BE4D2E963DA67B8F3EB62B78BE0D424E3D5EBB7F3D056098BEC5F011BDCB26033E1656DD3C60B90DBE7EFF583DE9C118BDDDF5673E451A58BD8E0C32BE121C05BC8D148E3ED80AEA3DBE27AA3A1F534B3E97023EBD85C231BEEB7CF33BF5FF363D32DCDBBDFBFED83DB640623D66E5EA3D785D2B3E4E7470BE651C673EDC57473E601A24BECF5FFABD38AA14BE9EFC60BCE3E5983E10EE7D3E2BE0453ECE3C41BE0A3B753E4225383D967059BDC197B8BD4FA407BE5C0C17BE0F16953E9884E7BC2E6CF6BDAF01453EB6F466BE37968ABE5BC056BECBD28E3ED9AC493DACC7103D46EB73BEBB39213DFD30F0BD835CD4BD426F4FBD620472BE67FB97BEFBAE8CBDF08F553EFAB77D3EFCC7D3BC76249D3EA2C404BE937B9B3E14197E3D30212D3E262F63BD7899D1BDF72C993E7537DE3D6FD39D3E988D4B3DDB132B3E097461BD8461313C"> - memref.global "private" constant @__constant_20x20xf32_cuda : memref<20x20xf32, "cuda"> = 
dense<"0x32A148BE93869BBDDF0E313EADAE1EBD5C52443E3506E7BBF2CFB3BD4CF5A13DBC209B3D697E213D4A14543DAB303B3EB6C1FA3B3524DBBCE1C81ABE9DDF51BEBF11173EFFDA4C3E6E8A2DBEF9F5013E6D99D1BDBEED34BDA2CFA0BCBFCE46BE85B19BBC4D7659BE398863BEC7A3DD3D590C1ABE4A942DBE9EB929BE6A0D20BE941C90BD097A76BC0640443EAE5B30BE723C573EC0EE403E0F2024BE3D24D53D3C565A3E62E5843DA45CAF3C41EC6DBBE2019E3D1D3A013E42C74CBE4E3CE63D734E60BE530B01BECD1844BEEC38BE3C19E862BEFC6F0DBEE58D4F3D03A7AA3D1DCF543C756D503E81D23BBEE0D914BE614360BCC40D4F3E5A313E3E34D6FDBD9C3519BE72500C3D1B39AABD9B2983BD110010BDAFAD27BCEAFFB5BD4A82103D8E7AD13CD2693FBE3E6125BE03FD743D3B3825BEB7FCD53DCCDA58BE2B3A4CBE1E9BDF3BFF38F73CC33C35BD40D157BEDD478A3CD1E6373E436BEEBD04BE2E3EBD5F2B3D1D7813BE0EF101BE0369A1BD8FF8D93D829D3A3E85CB2EBC6A2B023EEF8C273EB6F3C2BD27F955BDAE3B4ABC265A9D3D45B35D3E112654BB8BEEB6BDBD70993D7DDE62BEDB8017BEE2AC4FBE9F8E213E592C1B3E809B3DBE7DF30B3ED463133D672DEB3DEB5A44BEFFA2B03C894A33BE8A2935BE93A6223EE10741BEBD5F18BEDE0A0EBEEDE673BD23F447BEB37F3BBE052B503E5683343E13D6283EC693A03D751BB5BDF94F243E5014C23C003410BEEBBD613ECADD55BE56C2C3BD8922313DA41CF7BD0CF8D5BD13AA29BE8FE25B3ECC6652BE20FF4CBED78823BD32E5003E392A323DE1BAA13DFB3A9BBD9FF356BD3C4FE13DFDFA51BDB60E2C3EBEE61D3EC5EFDA3DE721A0BD13E8F3BC0C34F5BD3E911D3EFD7F38BD3C54933DC201E4BC35B79BBD09665BBE2F2FC8BDBEFA293E668CEDBD0F397ABDC1675D3EC0F1E03D164A5FBDAA43353C19AC2FBE1A2E90BC3881E6BDC3EA5EBE550D21BEB6B3023E1E97B0BD500C6DBD9577BB3D085E783C9044093D916844BE162F39BE955128BEB6F210BDA4C90BBDCD44643E511B3DBE8941593E30FEC73CBEC5D73D8FF2A2BDCB4F4B3E4F6DC2BB2249463E35888ABC80299C3D70BAFEBD40F617BEA941FFBDF13E123E4C305C3E5C838CBDF44A6EBD2A505EBEF0BFA83A8BC9E03DD0DA303D64522C3E58A96FBD7211593EC3BEA6BC14EF8ABDFE5949BE43880FBE46CE16BD097B633D7775193EB7E90F3ED2D3953DC271633D6BF8053DA6DFECBD5728953DB72E1DBD2B345DBD7053E83CE305B1BDF2F3013E0DE876BD8F1098BD7232E23D4CDE0C3DAF43A93D1A34DE3DFB1247BD569B3F3E09619BBD54502A3E6BA636BC17C3273EBE032E3E34F4173C54B10A3E9452EB3D82FDD33D005014BE578D51BDF3E116BE4EDE0F3E47BF58BEC35E793D7195303ED3A822BEA331963A9CF9403E1FD32DBD4EFCBB3DB47251BDD3A570BD3C32473E614DAD3A6978643EC49A543E02AA16BEDFDE243D7ADBE9BDE1224FBE6C6826BE5EC4083E7B00783D9E975C3EEF3A1FBDC7AD1DBDCB200EBDE55E993D734D97BC4C6E543E762EE83DBF6DFBBD5F161E3CB941F3BDF40BB2BD4C2E6ABD3B33963D7B3507BE3344553ED50454BD689A233E263939BE1D60D1BB4E3959BE0BAF203C719EC8BD48F3273EFB4B103D7943B0BDA66038BEC91500BE50263FBE52262FBE207BA93D661E823DEC4B56BE58CF3A3E913828BED66E42BE57F2B2BDFFB9503EE2FD4A3E2F23A43C542B493EE2CA34BE2966513EBB4B1FBD391127BEC942A7BD196DD9BD8AC012BD6514C1BC1F9E173EA59780BD511514BD9AB8F4BD339719BEB72639BE38C9D73CB5062F3E895743BD413055BE423D12BC402404BEB4A8463E4EA9F0BD57EA383C2432453C9611263ED58B283E6313C13DC2C95FBEA30B07BE15F2E4BD611C4BBE4FEF053CC73E013E9DF7B5BDED4160BE917CFDBD7FC9523E0A8B643EC300013E966686BDD0DC1ABE05B535BECD8A573E475E283DCECB5EBDFDE8B13D367E3EBEFCD7B43D3C7D24BECE26BD3D48BF20BEF7F0D7BB421F193E85BA77BDCAE272BAB0FD9CBD487E3FBEBBF9603D8B5E24BE27A9C33D95A2063E6DD0BC3DA359173EDA01A5BD505B97BDDEE6713D5283B43DE477DBBC674E2CBED5E3593DBCCB2FBED7D1303EA9753FBE017FC43DA94A9B3DE74A0E3E2FFC273EA596B2BDD8554A3C703E19BE04CF7BBD9155863D65FA1EBE144C573E709798BBF860CA3CC63F44BE"> - memref.global "private" constant @__constant_20x10xf32_cuda : memref<20x10xf32, "cuda"> = 
dense<"0xD60ED23D6677593EEDD4253D579F523E952583BD454F62BE2C2AB53D827CF4BD7F4D41BD15896C3DDEFF3B3D5270AF3D461635BC07D5B8BDB82761BE33120CBE692C35BCDB70183EED6F4D3E271EAD3DFBC1653B77D0473E7920B43C2135C43C1780C2BBD145D43C15B1203DAAD3623C9A1808BECF144D3E7AFAC43D933CF8BDFE637BBD21BE53BEADCA233E250529BD4576FABD2B7B84BCCCB48FBC84B3D6BDAB54263EC31061BEF81913BE3E001C3E4D7A0E3EE4CE4EBE2119CFBDC6B6123EB3CA01BEB506BF3DDA0774BD37FD1E3D1EB44BBE38E409BE994A1FBE74FFDB3DBB05A93D29209A3D756B01BDEAA162BD602A03BE522F3FBE9C1DDC3D486F1EBEB99D21BEE05813BE074943BE4233E33C32BBFA3D81F31ABE747C53BEC1814A3EA77B363E551E113E65E5673D0EDAC4BDC9732E3DD45C3C3E1BCB113E9DD61DBDC754B93DC9CB88BCB5D84DBEF14E443ED54E1EBE659C343E349F62BEF5BBD73D1042FDBDBFDD4EBD306D813DE33C16BE39B00EBE946CD7BDB97CC9BD08A411BDC933103EDE725FBEF3C1553D830E3B3CBFD7253ECFBD0D3EDC6CD83DA6EA563D9238C53C1405B2BD4948D83C0A72B03DEC8AD93DD04F3A3E96216E3C0351123E034807BE0929E7BDD3E3453DCD388B3D68C034BCE7650E3E41C70D3E392C32BEEA885CBC19FFD9BD8B68F83C6FE3B5BCC302633E736E193ED8D1503EBEB64B3EE0263C3EAC4E223DB90511BE36A5443E86AA713C3698A3BCB129A93D517218BEA7BB51BECA19393E8948043E5DE08BBDA1F012BED3483E3EB3F664BEEB093DBEEF2FB7BC3243F6BC2EFA283EB41EF5BDF16A573D19E6E83DDCF4F23DA4D8C3BD4877DFBC1B13303B60330D3ECA5569BD97A090BD3A5E9C3D3565B63DE54F6BBDAC1B29BDDEB4CD3D6C5006BE61B08FBC5AD9DA3B80340EBE933AD83D6334E93CE5F18EBDCC3D73BD128D2ABD46CA34BDDDA5BABDE0C404BD533F223D3072B4BD7D7C413EB831DABD3359BBBB0A69993D6024AD3D0D0EE9BDBD5A80BC7FA6E83D400BFBBD616438BE34CBC73D4769D73D4E295EBEECBE843C838E9CBDA171FA3DDCED3EBE9FEDA6BDAD78A8BDD19702BD18B3043E39CF873C46E602BCD0CBC33D"> memref.global "private" constant @__constant_10xf32_cuda : memref<10xf32, "cuda"> = dense<[0.0670170113, 0.0825609341, -0.125343189, -0.0073415176, -0.100303039, -0.214000896, 0.114002995, 0.21737574, 0.166609675, -0.119800359]> + memref.global "private" constant @__constant_10x20xf32_cuda : memref<10x20xf32, "cuda"> = dense<"0xD60ED23DDEFF3B3DFBC1653B7AFAC43DAB54263EDA0774BD602A03BE747C53BEC754B93D306D813DBFD7253E96216E3CEA885CBCB90511BEA1F012BEDCF4F23DAC1B29BD128D2ABD6024AD3D838E9CBD6677593E5270AF3D77D0473E933CF8BDC31061BE37FD1E3D522F3FBEC1814A3EC9CB88BCE33C16BECFBD0D3E0351123E19FFD9BD36A5443ED3483E3EA4D8C3BDDEB4CD3D46CA34BD0D0EE9BDA171FA3DEDD4253D461635BC7920B43CFE637BBDF81913BE1EB44BBE9C1DDC3DA77B363EB5D84DBE39B00EBEDC6CD83D034807BE8B68F83C86AA713CB3F664BE4877DFBC6C5006BEDDA5BABDBD5A80BCDCED3EBE579F523E07D5B8BD2135C43C21BE53BE3E001C3E38E409BE486F1EBE551E113EF14E443E946CD7BDA6EA563D0929E7BD6FE3B5BC3698A3BCEB093DBE1B13303B61B08FBCE0C404BD7FA6E83D9FEDA6BD952583BDB82761BE1780C2BBADCA233E4D7A0E3E994A1FBEB99D21BE65E5673DD54E1EBEB97CC9BD9238C53CD3E3453DC302633EB129A93DEF2FB7BC60330D3E5AD9DA3B533F223D400BFBBDAD78A8BD454F62BE33120CBED145D43C250529BDE4CE4EBE74FFDB3DE05813BE0EDAC4BD659C343E08A411BD1405B2BDCD388B3D736E193E517218BE3243F6BCCA5569BD80340EBE3072B4BD616438BED19702BD2C2AB53D692C35BC15B1203D4576FABD2119CFBDBB05A93D074943BEC9732E3D349F62BEC933103E4948D83C68C034BCD8D1503EA7BB51BE2EFA283E97A090BD933AD83D7D7C413E34CBC73D18B3043E827CF4BDDB70183EAAD3623C2B7B84BCC6B6123E29209A3D4233E33CD45C3C3EF5BBD73DDE725FBE0A72B03DE7650E3EBEB64B3ECA19393EB41EF5BD3A5E9C3D6334E93CB831DABD4769D73D39CF873C7F4D41BDED6F4D3E9A1808BECCB48FBCB3CA01BE756B01BD32BBFA3D1BCB113E1042FDBDF3C1553DEC8AD93D41C70D3EE0263C3E8948043EF16A573D3565B63DE5F18EBD3359BBBB4E295EBE46E602BC15896C3D271EAD3DCF144D3E84B3D6BDB506BF3DEAA162BD81F31ABE9DD61DBDBFDD4EBD830E3B3CD04F3A3E392C32BEAC4E223D5DE08BBD19E6E83DE54F6BBDCC3D73BD0A69993DECBE843CD0CBC33D"> 
memref.global "private" constant @__constant_20xf32_0_cuda : memref<20xf32, "cuda"> = dense<[0.124238588, -0.0375917405, -0.178324029, 2.1018261E-4, -0.0708629936, 0.179958493, 0.201986402, -0.0302014686, -0.0842267424, 0.0796111747, 0.0201944318, -0.183529228, -0.133614406, -0.0192934573, 0.193412527, 0.219010666, -0.0464102961, 0.00334274326, -0.0029087835, 0.0903228372]> + memref.global "private" constant @__constant_20x20xf32_cuda : memref<20x20xf32, "cuda"> = dense<"0x32A148BE6D99D1BD3C565A3E614360BC1E9BDF3B265A9D3DBD5F18BE8FE25B3EC201E4BC085E783CA941FFBDD2D3953D6BA636BCD3A570BDBF6DFBBD50263FBE6514C1BCC2C95FBEFCD7B43DE477DBBC93869BBDBEED34BD62E5843DC40D4F3EFF38F73C45B35D3EDE0A0EBECC6652BE35B79BBD9044093DF13E123EC271633D17C3273E3C32473E5F161E3C52262FBE1F9E173EA30B07BE3C7D24BE674E2CBEDF0E313EA2CFA0BCA45CAF3C5A313E3EC33C35BD112654BBEDE673BD20FF4CBE09665BBE916844BE4C305C3E6BF8053DBE032E3E614DAD3AB941F3BD207BA93DA59780BD15F2E4BDCE26BD3DD5E3593DADAE1EBDBFCE46BE41EC6DBB34D6FDBD40D157BE8BEEB6BD23F447BED78823BD2F2FC8BD162F39BE5C838CBDA6DFECBD34F4173C6978643EF40BB2BD661E823D511514BD611C4BBE48BF20BEBCCB2FBE5C52443E85B19BBCE2019E3D9C3519BEDD478A3CBD70993DB37F3BBE32E5003EBEFA293E955128BEF44A6EBD5728953D54B10A3EC49A543E4C2E6ABDEC4B56BE9AB8F4BD4FEF053CF7F0D7BBD7D1303E3506E7BB4D7659BE1D3A013E72500C3DD1E6373E7DDE62BE052B503E392A323D668CEDBDB6F210BD2A505EBEB72E1DBD9452EB3D02AA16BE3B33963D58CF3A3E339719BEC73E013E421F193EA9753FBEF2CFB3BD398863BE42C74CBE1B39AABD436BEEBDDB8017BE5683343EE1BAA13D0F397ABDA4C90BBDF0BFA83A2B345DBD82FDD33DDFDE243D7B3507BE913828BEB72639BE9DF7B5BD85BA77BD017FC43D4CF5A13DC7A3DD3D4E3CE63D9B2983BD04BE2E3EE2AC4FBE13D6283EFB3A9BBDC1675D3ECD44643E8BC9E03D7053E83C005014BE7ADBE9BD3344553ED66E42BE38C9D73CED4160BECAE272BAA94A9B3DBC209B3D590C1ABE734E60BE110010BDBD5F2B3D9F8E213EC693A03D9FF356BDC0F1E03D511B3DBED0DA303DE305B1BD578D51BDE1224FBED50454BD57F2B2BDB5062F3E917CFDBDB0FD9CBDE74A0E3E697E213D4A942DBE530B01BEAFAD27BC1D7813BE592C1B3E751BB5BD3C4FE13D164A5FBD8941593E64522C3EF2F3013EF3E116BE6C6826BE689A233EFFB9503E895743BD7FC9523E487E3FBE2FFC273E4A14543D9EB929BECD1844BEEAFFB5BD0EF101BE809B3DBEF94F243EFDFA51BDAA43353C30FEC73C58A96FBD0DE876BD4EDE0F3E5EC4083E263939BEE2FD4A3E413055BE0A8B643EBBF9603DA596B2BDAB303B3E6A0D20BEEC38BE3C4A82103D0369A1BD7DF30B3E5014C23CB60E2C3E19AC2FBEBEC5D73D7211593E8F1098BD47BF58BE7B00783D1D60D1BB2F23A43C423D12BCC300013E8B5E24BED8554A3CB6C1FA3B941C90BD19E862BE8E7AD13C8FF8D93DD463133D003410BEBEE61D3E1A2E90BC8FF2A2BDC3BEA6BC7232E23DC35E793D9E975C3E4E3959BE542B493E402404BE966686BD27A9C33D703E19BE3524DBBC097A76BCFC6F0DBED2693FBE829D3A3E672DEB3DEBBD613EC5EFDA3D3881E6BDCB4F4B3E14EF8ABD4CDE0C3D7195303EEF3A1FBD0BAF203CE2CA34BEB4A8463ED0DC1ABE95A2063E04CF7BBDE1C81ABE0640443EE58D4F3D3E6125BE85CB2EBCEB5A44BECADD55BEE721A0BDC3EA5EBE4F6DC2BBFE5949BEAF43A93DD3A822BEC7AD1DBD719EC8BD2966513E4EA9F0BD05B535BE6DD0BC3D9155863D9DDF51BEAE5B30BE03A7AA3D03FD743D6A2B023EFFA2B03C56C2C3BD13E8F3BC550D21BE2249463E43880FBE1A34DE3DA331963ACB200EBD48F3273EBB4B1FBD57EA383CCD8A573EA359173E65FA1EBEBF11173E723C573E1DCF543C3B3825BEEF8C273E894A33BE8922313D0C34F5BDB6B3023E35888ABC46CE16BDFB1247BD9CF9403EE55E993DFB4B103D391127BE2432453C475E283DDA01A5BD144C573EFFDA4C3EC0EE403E756D503EB7FCD53DB6F3C2BD8A2935BEA41CF7BD3E911D3E1E97B0BD80299C3D097B633D569B3F3E1FD32DBD734D97BC7943B0BDC942A7BD9611263ECECB5EBD505B97BD709798BB6E8A2DBE0F2024BE81D23BBECCDA58BE27F955BD93A6223E0CF8D5BDFD7F38BD500C6DBD70BAFEBD7775193E09619BBD4EFCBB3D4C6E543EA66038BE196DD9BDD58B283EFDE8B13DDEE6713DF860CA3CF9F5013E3D24D53DE0D914BE2B3A4CBEAE3B4
ABCE10741BE13AA29BE3C54933D9577BB3D40F617BEB7E90F3E54502A3EB47251BD762EE83DC91500BE8AC012BD6313C13D367E3EBE5283B43DC63F44BE"> memref.global "private" constant @__constant_20xf32_cuda : memref<20xf32, "cuda"> = dense<[0.101879634, -0.178835288, -0.0953023583, -0.0698504745, -0.19658649, -0.297641844, -0.223349303, 0.168986112, 9.007710e-02, 0.101534814, -0.0601868108, -0.0958566219, -0.243612081, 0.198881537, -0.293788224, -0.240900397, 0.184188008, 0.210917979, 0.121171109, -0.155078679]> + memref.global "private" constant @__constant_20x10xf32_cuda : memref<20x10xf32, "cuda"> = dense<"0xEDAEFD3D2456463E409C203E725C76BE7D51683ED8D783BEA67B8F3E85C231BE4225383D620472BE6B88963EA8E7983EA0BE27BCCA748B3E7ADA4B3E562D9ABEB62B78BEEB7CF33B967059BD67FB97BE1051E33DCE3D9EBCA3380BBED59EA23DDA6EDD3D2B60943E0D424E3DF5FF363DC197B8BDFBAE8CBDFC7732BE690042BEC03C5A3E63F6103E2DFDF1BD88C91ABE5EBB7F3D32DCDBBD4FA407BEF08F553E055CEE3CF0D34CBD2775633D2F3527BEC9A5D5BDB2DC513E056098BEFBFED83D5C0C17BEFAB77D3E07F9413E5AFCAB3D62069C3E354722BE948E7E3E136825BEC5F011BDB640623D0F16953EFCC7D3BC080B9CBECC1AF13D0DF3963DCE329DBECFCE7FBEBB0DBD3DCB26033E66E5EA3D9884E7BC76249D3E9B2F4ABE8E3CF43B259F883E39E496BE6E9E813D8E26E7BD1656DD3C785D2B3E2E6CF6BDA2C404BE5608BD3B0BF2583E7AAD743E46C71CBD85DBE43BF4464EBD60B90DBE4E7470BEAF01453E937B9B3EF8E6DFBDC82B583DEB5AC8BDC849E83D6543143DD9E68E3E7EFF583D651C673EB6F466BE14197E3D507F46BE658D653C4B210F3E8BD2ADBD379970BE5B5115BEE9C118BDDC57473E37968ABE30212D3EC61183BE79C131BECE1F303E2596A73DA80F16BE3FCF123EDDF5673E601A24BE5BC056BE262F63BDACE23ABE9AEF24BD3D4983BDF2DA0ABED17C18BD1E194FBE451A58BDCF5FFABDCBD28E3E7899D1BDF010903E85B4B03DA3A3F63DD46BA03D05E9193E2CACBF3D8E0C32BE38AA14BED9AC493DF72C993E824129BE46DBAF3D3D993FBE752989BD0F403B3DC71E153D121C05BC9EFC60BCACC7103D7537DE3DAFB6D83CEB4013BD868FF1BCB8624CBE26EDA43B96F24B3E8D148E3EE3E5983E46EB73BE6FD39D3E779721BE26A9693E89B98ABE971808BE808C9ABE53E9E9BDD80AEA3D10EE7D3EBB39213D988D4B3D953B2B3EB17CC43C13D72E3DB1C184BDA6A2823D407141BEBE27AA3A2BE0453EFD30F0BDDB132B3E8B44CB3BEDAF77BE5012703ECC23743E08D786BE714361BE1F534B3ECE3C41BE835CD4BD097461BDB09383BEA24E5BBE826D35BE36B4AB3C2314AABC4D2E963D97023EBD0A3B753E426F4FBD8461313C"> } \ No newline at end of file diff --git a/compiler/test/E2E/MLPInference/9b_nvvm_codegen.mlir b/compiler/test/E2E/MLPInference/9b_nvvm_codegen.mlir index 2e52dd556..bdaa7894b 100644 --- a/compiler/test/E2E/MLPInference/9b_nvvm_codegen.mlir +++ b/compiler/test/E2E/MLPInference/9b_nvvm_codegen.mlir @@ -5,126 +5,65 @@ module attributes {byre.container_module, gpu.container_module, torch.debug_module_name = "GraphModule"} { gpu.module @unified { gpu.func @Unknown2(%arg0: memref<10xf32>, %arg1: memref<2x10xf32>, %arg2: memref<2x10xf32>) kernel { - %c0 = arith.constant 0 : index %c20 = arith.constant 20 : index %c10 = arith.constant 10 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c20 : index - scf.if %5 { - %6 = arith.remsi %4, %c10 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c10 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c10 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg1[%15, %9] : memref<2x10xf32> - %17 = memref.load %arg0[%9] : memref<10xf32> - %18 = 
arith.addf %16, %17 : f32 - memref.store %18, %arg2[%15, %9] : memref<2x10xf32> - } - gpu.return - } - gpu.func @Unknown1(%arg0: memref<20xf32>, %arg1: memref<2x20xf32>, %arg2: memref<2x20xf32>) kernel { - %cst = arith.constant 0.000000e+00 : f32 - %c0 = arith.constant 0 : index - %c40 = arith.constant 40 : index - %c20 = arith.constant 20 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c40 : index - scf.if %5 { - %6 = arith.remsi %4, %c20 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c20 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c20 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg1[%15, %9] : memref<2x20xf32> - %17 = memref.load %arg0[%9] : memref<20xf32> - %18 = arith.addf %16, %17 : f32 - %19 = arith.maxnumf %18, %cst : f32 - memref.store %19, %arg2[%15, %9] : memref<2x20xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c20 step %6 { + %7 = arith.remsi %arg3, %c10 : index + %8 = arith.divsi %arg3, %c10 : index + %9 = memref.load %arg0[%7] : memref<10xf32> + %10 = memref.load %arg1[%8, %7] : memref<2x10xf32> + %11 = arith.addf %10, %9 : f32 + memref.store %11, %arg2[%8, %7] : memref<2x10xf32> } gpu.return } gpu.func @Unknown0(%arg0: memref<20xf32>, %arg1: memref<2x20xf32>, %arg2: memref<2x20xf32>) kernel { - %cst = arith.constant 0.000000e+00 : f32 - %c0 = arith.constant 0 : index %c40 = arith.constant 40 : index + %cst = arith.constant 0.000000e+00 : f32 %c20 = arith.constant 20 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c40 : index - scf.if %5 { - %6 = arith.remsi %4, %c20 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c20 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c20 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg1[%15, %9] : memref<2x20xf32> - %17 = memref.load %arg0[%9] : memref<20xf32> - %18 = arith.addf %16, %17 : f32 - %19 = arith.maxnumf %18, %cst : f32 - memref.store %19, %arg2[%15, %9] : memref<2x20xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c40 step %6 { + %7 = arith.remsi %arg3, %c20 : index + %8 = arith.divsi %arg3, %c20 : index + %9 = memref.load %arg0[%7] : memref<20xf32> + %10 = memref.load %arg1[%8, %7] : memref<2x20xf32> + %11 = arith.addf %10, %9 : f32 + %12 = arith.maximumf %11, %cst : f32 + memref.store %12, %arg2[%8, %7] : memref<2x20xf32> } gpu.return } } - func.func @forward(%arg0: memref<2x10xf32, "cuda"> {byre.argname = "Input0", byre.argtype = 1 : i32}, %arg1: memref<2x10xf32, "cuda"> {byre.argname = "Output0", byre.argtype = 2 : i32}) attributes {byre.entry_point} { - %alloc = memref.alloc() : memref<320xi8, "cuda"> - %alloc_0 = memref.alloc() : memref<20xf32, "cuda"> - byre.compute @FillOp(%alloc_0) {memory_effects = [2 : i32], value = dense<[0.101879634, -0.178835288, -0.0953023583, -0.0698504745, -0.19658649, 
-0.297641844, -0.223349303, 0.168986112, 9.007710e-02, 0.101534814, -0.0601868108, -0.0958566219, -0.243612081, 0.198881537, -0.293788224, -0.240900397, 0.184188008, 0.210917979, 0.121171109, -0.155078679]> : tensor<20xf32>} : memref<20xf32, "cuda"> - %alloc_1 = memref.alloc() : memref<20xf32, "cuda"> - byre.compute @FillOp(%alloc_1) {memory_effects = [2 : i32], value = dense<[0.124238588, -0.0375917405, -0.178324029, 2.1018261E-4, -0.0708629936, 0.179958493, 0.201986402, -0.0302014686, -0.0842267424, 0.0796111747, 0.0201944318, -0.183529228, -0.133614406, -0.0192934573, 0.193412527, 0.219010666, -0.0464102961, 0.00334274326, -0.0029087835, 0.0903228372]> : tensor<20xf32>} : memref<20xf32, "cuda"> - %alloc_2 = memref.alloc() : memref<10xf32, "cuda"> - byre.compute @FillOp(%alloc_2) {memory_effects = [2 : i32], value = dense<[0.0670170113, 0.0825609341, -0.125343189, -0.0073415176, -0.100303039, -0.214000896, 0.114002995, 0.21737574, 0.166609675, -0.119800359]> : tensor<10xf32>} : memref<10xf32, "cuda"> - %alloc_3 = memref.alloc() : memref<20x10xf32, "cuda"> - byre.compute @FillOp(%alloc_3) {memory_effects = [2 : i32], value = dense<"0xD60ED23D6677593EEDD4253D579F523E952583BD454F62BE2C2AB53D827CF4BD7F4D41BD15896C3DDEFF3B3D5270AF3D461635BC07D5B8BDB82761BE33120CBE692C35BCDB70183EED6F4D3E271EAD3DFBC1653B77D0473E7920B43C2135C43C1780C2BBD145D43C15B1203DAAD3623C9A1808BECF144D3E7AFAC43D933CF8BDFE637BBD21BE53BEADCA233E250529BD4576FABD2B7B84BCCCB48FBC84B3D6BDAB54263EC31061BEF81913BE3E001C3E4D7A0E3EE4CE4EBE2119CFBDC6B6123EB3CA01BEB506BF3DDA0774BD37FD1E3D1EB44BBE38E409BE994A1FBE74FFDB3DBB05A93D29209A3D756B01BDEAA162BD602A03BE522F3FBE9C1DDC3D486F1EBEB99D21BEE05813BE074943BE4233E33C32BBFA3D81F31ABE747C53BEC1814A3EA77B363E551E113E65E5673D0EDAC4BDC9732E3DD45C3C3E1BCB113E9DD61DBDC754B93DC9CB88BCB5D84DBEF14E443ED54E1EBE659C343E349F62BEF5BBD73D1042FDBDBFDD4EBD306D813DE33C16BE39B00EBE946CD7BDB97CC9BD08A411BDC933103EDE725FBEF3C1553D830E3B3CBFD7253ECFBD0D3EDC6CD83DA6EA563D9238C53C1405B2BD4948D83C0A72B03DEC8AD93DD04F3A3E96216E3C0351123E034807BE0929E7BDD3E3453DCD388B3D68C034BCE7650E3E41C70D3E392C32BEEA885CBC19FFD9BD8B68F83C6FE3B5BCC302633E736E193ED8D1503EBEB64B3EE0263C3EAC4E223DB90511BE36A5443E86AA713C3698A3BCB129A93D517218BEA7BB51BECA19393E8948043E5DE08BBDA1F012BED3483E3EB3F664BEEB093DBEEF2FB7BC3243F6BC2EFA283EB41EF5BDF16A573D19E6E83DDCF4F23DA4D8C3BD4877DFBC1B13303B60330D3ECA5569BD97A090BD3A5E9C3D3565B63DE54F6BBDAC1B29BDDEB4CD3D6C5006BE61B08FBC5AD9DA3B80340EBE933AD83D6334E93CE5F18EBDCC3D73BD128D2ABD46CA34BDDDA5BABDE0C404BD533F223D3072B4BD7D7C413EB831DABD3359BBBB0A69993D6024AD3D0D0EE9BDBD5A80BC7FA6E83D400BFBBD616438BE34CBC73D4769D73D4E295EBEECBE843C838E9CBDA171FA3DDCED3EBE9FEDA6BDAD78A8BDD19702BD18B3043E39CF873C46E602BCD0CBC33D"> : tensor<20x10xf32>} : memref<20x10xf32, "cuda"> - %alloc_4 = memref.alloc() : memref<20x20xf32, "cuda"> - byre.compute @FillOp(%alloc_4) {memory_effects = [2 : i32], value = 
dense<"0x32A148BE93869BBDDF0E313EADAE1EBD5C52443E3506E7BBF2CFB3BD4CF5A13DBC209B3D697E213D4A14543DAB303B3EB6C1FA3B3524DBBCE1C81ABE9DDF51BEBF11173EFFDA4C3E6E8A2DBEF9F5013E6D99D1BDBEED34BDA2CFA0BCBFCE46BE85B19BBC4D7659BE398863BEC7A3DD3D590C1ABE4A942DBE9EB929BE6A0D20BE941C90BD097A76BC0640443EAE5B30BE723C573EC0EE403E0F2024BE3D24D53D3C565A3E62E5843DA45CAF3C41EC6DBBE2019E3D1D3A013E42C74CBE4E3CE63D734E60BE530B01BECD1844BEEC38BE3C19E862BEFC6F0DBEE58D4F3D03A7AA3D1DCF543C756D503E81D23BBEE0D914BE614360BCC40D4F3E5A313E3E34D6FDBD9C3519BE72500C3D1B39AABD9B2983BD110010BDAFAD27BCEAFFB5BD4A82103D8E7AD13CD2693FBE3E6125BE03FD743D3B3825BEB7FCD53DCCDA58BE2B3A4CBE1E9BDF3BFF38F73CC33C35BD40D157BEDD478A3CD1E6373E436BEEBD04BE2E3EBD5F2B3D1D7813BE0EF101BE0369A1BD8FF8D93D829D3A3E85CB2EBC6A2B023EEF8C273EB6F3C2BD27F955BDAE3B4ABC265A9D3D45B35D3E112654BB8BEEB6BDBD70993D7DDE62BEDB8017BEE2AC4FBE9F8E213E592C1B3E809B3DBE7DF30B3ED463133D672DEB3DEB5A44BEFFA2B03C894A33BE8A2935BE93A6223EE10741BEBD5F18BEDE0A0EBEEDE673BD23F447BEB37F3BBE052B503E5683343E13D6283EC693A03D751BB5BDF94F243E5014C23C003410BEEBBD613ECADD55BE56C2C3BD8922313DA41CF7BD0CF8D5BD13AA29BE8FE25B3ECC6652BE20FF4CBED78823BD32E5003E392A323DE1BAA13DFB3A9BBD9FF356BD3C4FE13DFDFA51BDB60E2C3EBEE61D3EC5EFDA3DE721A0BD13E8F3BC0C34F5BD3E911D3EFD7F38BD3C54933DC201E4BC35B79BBD09665BBE2F2FC8BDBEFA293E668CEDBD0F397ABDC1675D3EC0F1E03D164A5FBDAA43353C19AC2FBE1A2E90BC3881E6BDC3EA5EBE550D21BEB6B3023E1E97B0BD500C6DBD9577BB3D085E783C9044093D916844BE162F39BE955128BEB6F210BDA4C90BBDCD44643E511B3DBE8941593E30FEC73CBEC5D73D8FF2A2BDCB4F4B3E4F6DC2BB2249463E35888ABC80299C3D70BAFEBD40F617BEA941FFBDF13E123E4C305C3E5C838CBDF44A6EBD2A505EBEF0BFA83A8BC9E03DD0DA303D64522C3E58A96FBD7211593EC3BEA6BC14EF8ABDFE5949BE43880FBE46CE16BD097B633D7775193EB7E90F3ED2D3953DC271633D6BF8053DA6DFECBD5728953DB72E1DBD2B345DBD7053E83CE305B1BDF2F3013E0DE876BD8F1098BD7232E23D4CDE0C3DAF43A93D1A34DE3DFB1247BD569B3F3E09619BBD54502A3E6BA636BC17C3273EBE032E3E34F4173C54B10A3E9452EB3D82FDD33D005014BE578D51BDF3E116BE4EDE0F3E47BF58BEC35E793D7195303ED3A822BEA331963A9CF9403E1FD32DBD4EFCBB3DB47251BDD3A570BD3C32473E614DAD3A6978643EC49A543E02AA16BEDFDE243D7ADBE9BDE1224FBE6C6826BE5EC4083E7B00783D9E975C3EEF3A1FBDC7AD1DBDCB200EBDE55E993D734D97BC4C6E543E762EE83DBF6DFBBD5F161E3CB941F3BDF40BB2BD4C2E6ABD3B33963D7B3507BE3344553ED50454BD689A233E263939BE1D60D1BB4E3959BE0BAF203C719EC8BD48F3273EFB4B103D7943B0BDA66038BEC91500BE50263FBE52262FBE207BA93D661E823DEC4B56BE58CF3A3E913828BED66E42BE57F2B2BDFFB9503EE2FD4A3E2F23A43C542B493EE2CA34BE2966513EBB4B1FBD391127BEC942A7BD196DD9BD8AC012BD6514C1BC1F9E173EA59780BD511514BD9AB8F4BD339719BEB72639BE38C9D73CB5062F3E895743BD413055BE423D12BC402404BEB4A8463E4EA9F0BD57EA383C2432453C9611263ED58B283E6313C13DC2C95FBEA30B07BE15F2E4BD611C4BBE4FEF053CC73E013E9DF7B5BDED4160BE917CFDBD7FC9523E0A8B643EC300013E966686BDD0DC1ABE05B535BECD8A573E475E283DCECB5EBDFDE8B13D367E3EBEFCD7B43D3C7D24BECE26BD3D48BF20BEF7F0D7BB421F193E85BA77BDCAE272BAB0FD9CBD487E3FBEBBF9603D8B5E24BE27A9C33D95A2063E6DD0BC3DA359173EDA01A5BD505B97BDDEE6713D5283B43DE477DBBC674E2CBED5E3593DBCCB2FBED7D1303EA9753FBE017FC43DA94A9B3DE74A0E3E2FFC273EA596B2BDD8554A3C703E19BE04CF7BBD9155863D65FA1EBE144C573E709798BBF860CA3CC63F44BE"> : tensor<20x20xf32>} : memref<20x20xf32, "cuda"> - %alloc_5 = memref.alloc() : memref<10x20xf32, "cuda"> - byre.compute @FillOp(%alloc_5) {memory_effects = [2 : i32], value = 
dense<"0xEDAEFD3D6B88963E1051E33DFC7732BE055CEE3C07F9413E080B9CBE9B2F4ABE5608BD3BF8E6DFBD507F46BEC61183BEACE23ABEF010903E824129BEAFB6D83C779721BE953B2B3E8B44CB3BB09383BE2456463EA8E7983ECE3D9EBC690042BEF0D34CBD5AFCAB3DCC1AF13D8E3CF43B0BF2583EC82B583D658D653C79C131BE9AEF24BD85B4B03D46DBAF3DEB4013BD26A9693EB17CC43CEDAF77BEA24E5BBE409C203EA0BE27BCA3380BBEC03C5A3E2775633D62069C3E0DF3963D259F883E7AAD743EEB5AC8BD4B210F3ECE1F303E3D4983BDA3A3F63D3D993FBE868FF1BC89B98ABE13D72E3D5012703E826D35BE725C76BECA748B3ED59EA23D63F6103E2F3527BE354722BECE329DBE39E496BE46C71CBDC849E83D8BD2ADBD2596A73DF2DA0ABED46BA03D752989BDB8624CBE971808BEB1C184BDCC23743E36B4AB3C7D51683E7ADA4B3EDA6EDD3D2DFDF1BDC9A5D5BD948E7E3ECFCE7FBE6E9E813D85DBE43B6543143D379970BEA80F16BED17C18BD05E9193E0F403B3D26EDA43B808C9ABEA6A2823D08D786BE2314AABCD8D783BE562D9ABE2B60943E88C91ABEB2DC513E136825BEBB0DBD3D8E26E7BDF4464EBDD9E68E3E5B5115BE3FCF123E1E194FBE2CACBF3DC71E153D96F24B3E53E9E9BD407141BE714361BE4D2E963DA67B8F3EB62B78BE0D424E3D5EBB7F3D056098BEC5F011BDCB26033E1656DD3C60B90DBE7EFF583DE9C118BDDDF5673E451A58BD8E0C32BE121C05BC8D148E3ED80AEA3DBE27AA3A1F534B3E97023EBD85C231BEEB7CF33BF5FF363D32DCDBBDFBFED83DB640623D66E5EA3D785D2B3E4E7470BE651C673EDC57473E601A24BECF5FFABD38AA14BE9EFC60BCE3E5983E10EE7D3E2BE0453ECE3C41BE0A3B753E4225383D967059BDC197B8BD4FA407BE5C0C17BE0F16953E9884E7BC2E6CF6BDAF01453EB6F466BE37968ABE5BC056BECBD28E3ED9AC493DACC7103D46EB73BEBB39213DFD30F0BD835CD4BD426F4FBD620472BE67FB97BEFBAE8CBDF08F553EFAB77D3EFCC7D3BC76249D3EA2C404BE937B9B3E14197E3D30212D3E262F63BD7899D1BDF72C993E7537DE3D6FD39D3E988D4B3DDB132B3E097461BD8461313C"> : tensor<10x20xf32>} : memref<10x20xf32, "cuda"> - %0 = "byre.alias"(%alloc) {offset = 0 : i64} : (memref<320xi8, "cuda">) -> memref<2x20xf32, "cuda"> - byre.compute @MatmulOp_f32f32_f32(%arg0, %alloc_5, %0) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<2x10xf32, "cuda">, memref<10x20xf32, "cuda">, memref<2x20xf32, "cuda"> - %1 = "byre.alias"(%alloc) {offset = 160 : i64} : (memref<320xi8, "cuda">) -> memref<2x20xf32, "cuda"> - byre.compute @PTXOp(%alloc_0, %0, %1) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown0", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<20xf32, "cuda">, memref<2x20xf32, "cuda">, memref<2x20xf32, "cuda"> - byre.compute @MatmulOp_f32f32_f32(%1, %alloc_4, %0) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<2x20xf32, "cuda">, memref<20x20xf32, "cuda">, memref<2x20xf32, "cuda"> - byre.compute @PTXOp(%alloc_1, %0, %1) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown1", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<20xf32, "cuda">, memref<2x20xf32, "cuda">, memref<2x20xf32, "cuda"> - %2 = "byre.alias"(%alloc) {offset = 0 : i64} : (memref<320xi8, "cuda">) -> memref<2x10xf32, "cuda"> - byre.compute @MatmulOp_f32f32_f32(%1, %alloc_3, %2) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<2x20xf32, "cuda">, memref<20x10xf32, "cuda">, memref<2x10xf32, "cuda"> - byre.compute @PTXOp(%alloc_2, %2, %arg1) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown2", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : 
memref<10xf32, "cuda">, memref<2x10xf32, "cuda">, memref<2x10xf32, "cuda"> + func.func @forward(%arg0: memref<20x10xf32, "cuda"> {byre.argname = "Weight0", byre.argtype = 4 : i32, byre.weight_value = dense<"0xEDAEFD3D2456463E409C203E725C76BE7D51683ED8D783BEA67B8F3E85C231BE4225383D620472BE6B88963EA8E7983EA0BE27BCCA748B3E7ADA4B3E562D9ABEB62B78BEEB7CF33B967059BD67FB97BE1051E33DCE3D9EBCA3380BBED59EA23DDA6EDD3D2B60943E0D424E3DF5FF363DC197B8BDFBAE8CBDFC7732BE690042BEC03C5A3E63F6103E2DFDF1BD88C91ABE5EBB7F3D32DCDBBD4FA407BEF08F553E055CEE3CF0D34CBD2775633D2F3527BEC9A5D5BDB2DC513E056098BEFBFED83D5C0C17BEFAB77D3E07F9413E5AFCAB3D62069C3E354722BE948E7E3E136825BEC5F011BDB640623D0F16953EFCC7D3BC080B9CBECC1AF13D0DF3963DCE329DBECFCE7FBEBB0DBD3DCB26033E66E5EA3D9884E7BC76249D3E9B2F4ABE8E3CF43B259F883E39E496BE6E9E813D8E26E7BD1656DD3C785D2B3E2E6CF6BDA2C404BE5608BD3B0BF2583E7AAD743E46C71CBD85DBE43BF4464EBD60B90DBE4E7470BEAF01453E937B9B3EF8E6DFBDC82B583DEB5AC8BDC849E83D6543143DD9E68E3E7EFF583D651C673EB6F466BE14197E3D507F46BE658D653C4B210F3E8BD2ADBD379970BE5B5115BEE9C118BDDC57473E37968ABE30212D3EC61183BE79C131BECE1F303E2596A73DA80F16BE3FCF123EDDF5673E601A24BE5BC056BE262F63BDACE23ABE9AEF24BD3D4983BDF2DA0ABED17C18BD1E194FBE451A58BDCF5FFABDCBD28E3E7899D1BDF010903E85B4B03DA3A3F63DD46BA03D05E9193E2CACBF3D8E0C32BE38AA14BED9AC493DF72C993E824129BE46DBAF3D3D993FBE752989BD0F403B3DC71E153D121C05BC9EFC60BCACC7103D7537DE3DAFB6D83CEB4013BD868FF1BCB8624CBE26EDA43B96F24B3E8D148E3EE3E5983E46EB73BE6FD39D3E779721BE26A9693E89B98ABE971808BE808C9ABE53E9E9BDD80AEA3D10EE7D3EBB39213D988D4B3D953B2B3EB17CC43C13D72E3DB1C184BDA6A2823D407141BEBE27AA3A2BE0453EFD30F0BDDB132B3E8B44CB3BEDAF77BE5012703ECC23743E08D786BE714361BE1F534B3ECE3C41BE835CD4BD097461BDB09383BEA24E5BBE826D35BE36B4AB3C2314AABC4D2E963D97023EBD0A3B753E426F4FBD8461313C"> : tensor<20x10xf32>}, %arg1: memref<20xf32, "cuda"> {byre.argname = "Weight1", byre.argtype = 4 : i32, byre.weight_value = dense<[0.101879634, -0.178835288, -0.0953023583, -0.0698504745, -0.19658649, -0.297641844, -0.223349303, 0.168986112, 9.007710e-02, 0.101534814, -0.0601868108, -0.0958566219, -0.243612081, 0.198881537, -0.293788224, -0.240900397, 0.184188008, 0.210917979, 0.121171109, -0.155078679]> : tensor<20xf32>}, %arg2: memref<20x20xf32, "cuda"> {byre.argname = "Weight2", byre.argtype = 4 : i32, byre.weight_value = 
dense<"0x32A148BE6D99D1BD3C565A3E614360BC1E9BDF3B265A9D3DBD5F18BE8FE25B3EC201E4BC085E783CA941FFBDD2D3953D6BA636BCD3A570BDBF6DFBBD50263FBE6514C1BCC2C95FBEFCD7B43DE477DBBC93869BBDBEED34BD62E5843DC40D4F3EFF38F73C45B35D3EDE0A0EBECC6652BE35B79BBD9044093DF13E123EC271633D17C3273E3C32473E5F161E3C52262FBE1F9E173EA30B07BE3C7D24BE674E2CBEDF0E313EA2CFA0BCA45CAF3C5A313E3EC33C35BD112654BBEDE673BD20FF4CBE09665BBE916844BE4C305C3E6BF8053DBE032E3E614DAD3AB941F3BD207BA93DA59780BD15F2E4BDCE26BD3DD5E3593DADAE1EBDBFCE46BE41EC6DBB34D6FDBD40D157BE8BEEB6BD23F447BED78823BD2F2FC8BD162F39BE5C838CBDA6DFECBD34F4173C6978643EF40BB2BD661E823D511514BD611C4BBE48BF20BEBCCB2FBE5C52443E85B19BBCE2019E3D9C3519BEDD478A3CBD70993DB37F3BBE32E5003EBEFA293E955128BEF44A6EBD5728953D54B10A3EC49A543E4C2E6ABDEC4B56BE9AB8F4BD4FEF053CF7F0D7BBD7D1303E3506E7BB4D7659BE1D3A013E72500C3DD1E6373E7DDE62BE052B503E392A323D668CEDBDB6F210BD2A505EBEB72E1DBD9452EB3D02AA16BE3B33963D58CF3A3E339719BEC73E013E421F193EA9753FBEF2CFB3BD398863BE42C74CBE1B39AABD436BEEBDDB8017BE5683343EE1BAA13D0F397ABDA4C90BBDF0BFA83A2B345DBD82FDD33DDFDE243D7B3507BE913828BEB72639BE9DF7B5BD85BA77BD017FC43D4CF5A13DC7A3DD3D4E3CE63D9B2983BD04BE2E3EE2AC4FBE13D6283EFB3A9BBDC1675D3ECD44643E8BC9E03D7053E83C005014BE7ADBE9BD3344553ED66E42BE38C9D73CED4160BECAE272BAA94A9B3DBC209B3D590C1ABE734E60BE110010BDBD5F2B3D9F8E213EC693A03D9FF356BDC0F1E03D511B3DBED0DA303DE305B1BD578D51BDE1224FBED50454BD57F2B2BDB5062F3E917CFDBDB0FD9CBDE74A0E3E697E213D4A942DBE530B01BEAFAD27BC1D7813BE592C1B3E751BB5BD3C4FE13D164A5FBD8941593E64522C3EF2F3013EF3E116BE6C6826BE689A233EFFB9503E895743BD7FC9523E487E3FBE2FFC273E4A14543D9EB929BECD1844BEEAFFB5BD0EF101BE809B3DBEF94F243EFDFA51BDAA43353C30FEC73C58A96FBD0DE876BD4EDE0F3E5EC4083E263939BEE2FD4A3E413055BE0A8B643EBBF9603DA596B2BDAB303B3E6A0D20BEEC38BE3C4A82103D0369A1BD7DF30B3E5014C23CB60E2C3E19AC2FBEBEC5D73D7211593E8F1098BD47BF58BE7B00783D1D60D1BB2F23A43C423D12BCC300013E8B5E24BED8554A3CB6C1FA3B941C90BD19E862BE8E7AD13C8FF8D93DD463133D003410BEBEE61D3E1A2E90BC8FF2A2BDC3BEA6BC7232E23DC35E793D9E975C3E4E3959BE542B493E402404BE966686BD27A9C33D703E19BE3524DBBC097A76BCFC6F0DBED2693FBE829D3A3E672DEB3DEBBD613EC5EFDA3D3881E6BDCB4F4B3E14EF8ABD4CDE0C3D7195303EEF3A1FBD0BAF203CE2CA34BEB4A8463ED0DC1ABE95A2063E04CF7BBDE1C81ABE0640443EE58D4F3D3E6125BE85CB2EBCEB5A44BECADD55BEE721A0BDC3EA5EBE4F6DC2BBFE5949BEAF43A93DD3A822BEC7AD1DBD719EC8BD2966513E4EA9F0BD05B535BE6DD0BC3D9155863D9DDF51BEAE5B30BE03A7AA3D03FD743D6A2B023EFFA2B03C56C2C3BD13E8F3BC550D21BE2249463E43880FBE1A34DE3DA331963ACB200EBD48F3273EBB4B1FBD57EA383CCD8A573EA359173E65FA1EBEBF11173E723C573E1DCF543C3B3825BEEF8C273E894A33BE8922313D0C34F5BDB6B3023E35888ABC46CE16BDFB1247BD9CF9403EE55E993DFB4B103D391127BE2432453C475E283DDA01A5BD144C573EFFDA4C3EC0EE403E756D503EB7FCD53DB6F3C2BD8A2935BEA41CF7BD3E911D3E1E97B0BD80299C3D097B633D569B3F3E1FD32DBD734D97BC7943B0BDC942A7BD9611263ECECB5EBD505B97BD709798BB6E8A2DBE0F2024BE81D23BBECCDA58BE27F955BD93A6223E0CF8D5BDFD7F38BD500C6DBD70BAFEBD7775193E09619BBD4EFCBB3D4C6E543EA66038BE196DD9BDD58B283EFDE8B13DDEE6713DF860CA3CF9F5013E3D24D53DE0D914BE2B3A4CBEAE3B4ABCE10741BE13AA29BE3C54933D9577BB3D40F617BEB7E90F3E54502A3EB47251BD762EE83DC91500BE8AC012BD6313C13D367E3EBE5283B43DC63F44BE"> : tensor<20x20xf32>}, %arg3: memref<20xf32, "cuda"> {byre.argname = "Weight3", byre.argtype = 4 : i32, byre.weight_value = dense<[0.124238588, -0.0375917405, -0.178324029, 2.1018261E-4, -0.0708629936, 0.179958493, 0.201986402, -0.0302014686, -0.0842267424, 0.0796111747, 0.0201944318, -0.183529228, -0.133614406, -0.0192934573, 0.193412527, 
0.219010666, -0.0464102961, 0.00334274326, -0.0029087835, 0.0903228372]> : tensor<20xf32>}, %arg4: memref<10x20xf32, "cuda"> {byre.argname = "Weight4", byre.argtype = 4 : i32, byre.weight_value = dense<"0xD60ED23DDEFF3B3DFBC1653B7AFAC43DAB54263EDA0774BD602A03BE747C53BEC754B93D306D813DBFD7253E96216E3CEA885CBCB90511BEA1F012BEDCF4F23DAC1B29BD128D2ABD6024AD3D838E9CBD6677593E5270AF3D77D0473E933CF8BDC31061BE37FD1E3D522F3FBEC1814A3EC9CB88BCE33C16BECFBD0D3E0351123E19FFD9BD36A5443ED3483E3EA4D8C3BDDEB4CD3D46CA34BD0D0EE9BDA171FA3DEDD4253D461635BC7920B43CFE637BBDF81913BE1EB44BBE9C1DDC3DA77B363EB5D84DBE39B00EBEDC6CD83D034807BE8B68F83C86AA713CB3F664BE4877DFBC6C5006BEDDA5BABDBD5A80BCDCED3EBE579F523E07D5B8BD2135C43C21BE53BE3E001C3E38E409BE486F1EBE551E113EF14E443E946CD7BDA6EA563D0929E7BD6FE3B5BC3698A3BCEB093DBE1B13303B61B08FBCE0C404BD7FA6E83D9FEDA6BD952583BDB82761BE1780C2BBADCA233E4D7A0E3E994A1FBEB99D21BE65E5673DD54E1EBEB97CC9BD9238C53CD3E3453DC302633EB129A93DEF2FB7BC60330D3E5AD9DA3B533F223D400BFBBDAD78A8BD454F62BE33120CBED145D43C250529BDE4CE4EBE74FFDB3DE05813BE0EDAC4BD659C343E08A411BD1405B2BDCD388B3D736E193E517218BE3243F6BCCA5569BD80340EBE3072B4BD616438BED19702BD2C2AB53D692C35BC15B1203D4576FABD2119CFBDBB05A93D074943BEC9732E3D349F62BEC933103E4948D83C68C034BCD8D1503EA7BB51BE2EFA283E97A090BD933AD83D7D7C413E34CBC73D18B3043E827CF4BDDB70183EAAD3623C2B7B84BCC6B6123E29209A3D4233E33CD45C3C3EF5BBD73DDE725FBE0A72B03DE7650E3EBEB64B3ECA19393EB41EF5BD3A5E9C3D6334E93CB831DABD4769D73D39CF873C7F4D41BDED6F4D3E9A1808BECCB48FBCB3CA01BE756B01BD32BBFA3D1BCB113E1042FDBDF3C1553DEC8AD93D41C70D3EE0263C3E8948043EF16A573D3565B63DE5F18EBD3359BBBB4E295EBE46E602BC15896C3D271EAD3DCF144D3E84B3D6BDB506BF3DEAA162BD81F31ABE9DD61DBDBFDD4EBD830E3B3CD04F3A3E392C32BEAC4E223D5DE08BBD19E6E83DE54F6BBDCC3D73BD0A69993DECBE843CD0CBC33D"> : tensor<10x20xf32>}, %arg5: memref<10xf32, "cuda"> {byre.argname = "Weight5", byre.argtype = 4 : i32, byre.weight_value = dense<[0.0670170113, 0.0825609341, -0.125343189, -0.0073415176, -0.100303039, -0.214000896, 0.114002995, 0.21737574, 0.166609675, -0.119800359]> : tensor<10xf32>}, %arg6: memref<2x10xf32, "cuda"> {byre.argname = "Input0", byre.argtype = 1 : i32}, %arg7: memref<2x10xf32, "cuda"> {byre.argname = "Output0", byre.argtype = 2 : i32}) attributes {byre.entry_point} { + %alloc = memref.alloc() : memref<512xi8, "cuda"> + %0 = "byre.alias"(%alloc) <{offset = 0 : i64}> : (memref<512xi8, "cuda">) -> memref<2x20xf32, "cuda"> + byre.compute @MatmulOp_f32f32_f32(%arg6, %arg0, %0) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<2x10xf32, "cuda">, memref<20x10xf32, "cuda">, memref<2x20xf32, "cuda"> + %1 = "byre.alias"(%alloc) <{offset = 256 : i64}> : (memref<512xi8, "cuda">) -> memref<2x20xf32, "cuda"> + byre.compute @PTXOp(%arg1, %0, %1) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown0", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<20xf32, "cuda">, memref<2x20xf32, "cuda">, memref<2x20xf32, "cuda"> + byre.compute @MatmulOp_f32f32_f32(%1, %arg2, %0) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<2x20xf32, "cuda">, memref<20x20xf32, "cuda">, memref<2x20xf32, "cuda"> + byre.compute @PTXOp(%arg3, %0, %1) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown0", memory_effects = [1 : i32, 1 : 
i32, 2 : i32]} : memref<20xf32, "cuda">, memref<2x20xf32, "cuda">, memref<2x20xf32, "cuda"> + %2 = "byre.alias"(%alloc) <{offset = 0 : i64}> : (memref<512xi8, "cuda">) -> memref<2x10xf32, "cuda"> + byre.compute @MatmulOp_f32f32_f32(%1, %arg4, %2) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<2x20xf32, "cuda">, memref<10x20xf32, "cuda">, memref<2x10xf32, "cuda"> + byre.compute @PTXOp(%arg5, %2, %arg7) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown2", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<10xf32, "cuda">, memref<2x10xf32, "cuda">, memref<2x10xf32, "cuda"> return } - memref.global "private" constant @__constant_10x20xf32_cuda : memref<10x20xf32, "cuda"> = dense<"0xEDAEFD3D6B88963E1051E33DFC7732BE055CEE3C07F9413E080B9CBE9B2F4ABE5608BD3BF8E6DFBD507F46BEC61183BEACE23ABEF010903E824129BEAFB6D83C779721BE953B2B3E8B44CB3BB09383BE2456463EA8E7983ECE3D9EBC690042BEF0D34CBD5AFCAB3DCC1AF13D8E3CF43B0BF2583EC82B583D658D653C79C131BE9AEF24BD85B4B03D46DBAF3DEB4013BD26A9693EB17CC43CEDAF77BEA24E5BBE409C203EA0BE27BCA3380BBEC03C5A3E2775633D62069C3E0DF3963D259F883E7AAD743EEB5AC8BD4B210F3ECE1F303E3D4983BDA3A3F63D3D993FBE868FF1BC89B98ABE13D72E3D5012703E826D35BE725C76BECA748B3ED59EA23D63F6103E2F3527BE354722BECE329DBE39E496BE46C71CBDC849E83D8BD2ADBD2596A73DF2DA0ABED46BA03D752989BDB8624CBE971808BEB1C184BDCC23743E36B4AB3C7D51683E7ADA4B3EDA6EDD3D2DFDF1BDC9A5D5BD948E7E3ECFCE7FBE6E9E813D85DBE43B6543143D379970BEA80F16BED17C18BD05E9193E0F403B3D26EDA43B808C9ABEA6A2823D08D786BE2314AABCD8D783BE562D9ABE2B60943E88C91ABEB2DC513E136825BEBB0DBD3D8E26E7BDF4464EBDD9E68E3E5B5115BE3FCF123E1E194FBE2CACBF3DC71E153D96F24B3E53E9E9BD407141BE714361BE4D2E963DA67B8F3EB62B78BE0D424E3D5EBB7F3D056098BEC5F011BDCB26033E1656DD3C60B90DBE7EFF583DE9C118BDDDF5673E451A58BD8E0C32BE121C05BC8D148E3ED80AEA3DBE27AA3A1F534B3E97023EBD85C231BEEB7CF33BF5FF363D32DCDBBDFBFED83DB640623D66E5EA3D785D2B3E4E7470BE651C673EDC57473E601A24BECF5FFABD38AA14BE9EFC60BCE3E5983E10EE7D3E2BE0453ECE3C41BE0A3B753E4225383D967059BDC197B8BD4FA407BE5C0C17BE0F16953E9884E7BC2E6CF6BDAF01453EB6F466BE37968ABE5BC056BECBD28E3ED9AC493DACC7103D46EB73BEBB39213DFD30F0BD835CD4BD426F4FBD620472BE67FB97BEFBAE8CBDF08F553EFAB77D3EFCC7D3BC76249D3EA2C404BE937B9B3E14197E3D30212D3E262F63BD7899D1BDF72C993E7537DE3D6FD39D3E988D4B3DDB132B3E097461BD8461313C"> - memref.global "private" constant @__constant_20x20xf32_cuda : memref<20x20xf32, "cuda"> = 
dense<"0x32A148BE93869BBDDF0E313EADAE1EBD5C52443E3506E7BBF2CFB3BD4CF5A13DBC209B3D697E213D4A14543DAB303B3EB6C1FA3B3524DBBCE1C81ABE9DDF51BEBF11173EFFDA4C3E6E8A2DBEF9F5013E6D99D1BDBEED34BDA2CFA0BCBFCE46BE85B19BBC4D7659BE398863BEC7A3DD3D590C1ABE4A942DBE9EB929BE6A0D20BE941C90BD097A76BC0640443EAE5B30BE723C573EC0EE403E0F2024BE3D24D53D3C565A3E62E5843DA45CAF3C41EC6DBBE2019E3D1D3A013E42C74CBE4E3CE63D734E60BE530B01BECD1844BEEC38BE3C19E862BEFC6F0DBEE58D4F3D03A7AA3D1DCF543C756D503E81D23BBEE0D914BE614360BCC40D4F3E5A313E3E34D6FDBD9C3519BE72500C3D1B39AABD9B2983BD110010BDAFAD27BCEAFFB5BD4A82103D8E7AD13CD2693FBE3E6125BE03FD743D3B3825BEB7FCD53DCCDA58BE2B3A4CBE1E9BDF3BFF38F73CC33C35BD40D157BEDD478A3CD1E6373E436BEEBD04BE2E3EBD5F2B3D1D7813BE0EF101BE0369A1BD8FF8D93D829D3A3E85CB2EBC6A2B023EEF8C273EB6F3C2BD27F955BDAE3B4ABC265A9D3D45B35D3E112654BB8BEEB6BDBD70993D7DDE62BEDB8017BEE2AC4FBE9F8E213E592C1B3E809B3DBE7DF30B3ED463133D672DEB3DEB5A44BEFFA2B03C894A33BE8A2935BE93A6223EE10741BEBD5F18BEDE0A0EBEEDE673BD23F447BEB37F3BBE052B503E5683343E13D6283EC693A03D751BB5BDF94F243E5014C23C003410BEEBBD613ECADD55BE56C2C3BD8922313DA41CF7BD0CF8D5BD13AA29BE8FE25B3ECC6652BE20FF4CBED78823BD32E5003E392A323DE1BAA13DFB3A9BBD9FF356BD3C4FE13DFDFA51BDB60E2C3EBEE61D3EC5EFDA3DE721A0BD13E8F3BC0C34F5BD3E911D3EFD7F38BD3C54933DC201E4BC35B79BBD09665BBE2F2FC8BDBEFA293E668CEDBD0F397ABDC1675D3EC0F1E03D164A5FBDAA43353C19AC2FBE1A2E90BC3881E6BDC3EA5EBE550D21BEB6B3023E1E97B0BD500C6DBD9577BB3D085E783C9044093D916844BE162F39BE955128BEB6F210BDA4C90BBDCD44643E511B3DBE8941593E30FEC73CBEC5D73D8FF2A2BDCB4F4B3E4F6DC2BB2249463E35888ABC80299C3D70BAFEBD40F617BEA941FFBDF13E123E4C305C3E5C838CBDF44A6EBD2A505EBEF0BFA83A8BC9E03DD0DA303D64522C3E58A96FBD7211593EC3BEA6BC14EF8ABDFE5949BE43880FBE46CE16BD097B633D7775193EB7E90F3ED2D3953DC271633D6BF8053DA6DFECBD5728953DB72E1DBD2B345DBD7053E83CE305B1BDF2F3013E0DE876BD8F1098BD7232E23D4CDE0C3DAF43A93D1A34DE3DFB1247BD569B3F3E09619BBD54502A3E6BA636BC17C3273EBE032E3E34F4173C54B10A3E9452EB3D82FDD33D005014BE578D51BDF3E116BE4EDE0F3E47BF58BEC35E793D7195303ED3A822BEA331963A9CF9403E1FD32DBD4EFCBB3DB47251BDD3A570BD3C32473E614DAD3A6978643EC49A543E02AA16BEDFDE243D7ADBE9BDE1224FBE6C6826BE5EC4083E7B00783D9E975C3EEF3A1FBDC7AD1DBDCB200EBDE55E993D734D97BC4C6E543E762EE83DBF6DFBBD5F161E3CB941F3BDF40BB2BD4C2E6ABD3B33963D7B3507BE3344553ED50454BD689A233E263939BE1D60D1BB4E3959BE0BAF203C719EC8BD48F3273EFB4B103D7943B0BDA66038BEC91500BE50263FBE52262FBE207BA93D661E823DEC4B56BE58CF3A3E913828BED66E42BE57F2B2BDFFB9503EE2FD4A3E2F23A43C542B493EE2CA34BE2966513EBB4B1FBD391127BEC942A7BD196DD9BD8AC012BD6514C1BC1F9E173EA59780BD511514BD9AB8F4BD339719BEB72639BE38C9D73CB5062F3E895743BD413055BE423D12BC402404BEB4A8463E4EA9F0BD57EA383C2432453C9611263ED58B283E6313C13DC2C95FBEA30B07BE15F2E4BD611C4BBE4FEF053CC73E013E9DF7B5BDED4160BE917CFDBD7FC9523E0A8B643EC300013E966686BDD0DC1ABE05B535BECD8A573E475E283DCECB5EBDFDE8B13D367E3EBEFCD7B43D3C7D24BECE26BD3D48BF20BEF7F0D7BB421F193E85BA77BDCAE272BAB0FD9CBD487E3FBEBBF9603D8B5E24BE27A9C33D95A2063E6DD0BC3DA359173EDA01A5BD505B97BDDEE6713D5283B43DE477DBBC674E2CBED5E3593DBCCB2FBED7D1303EA9753FBE017FC43DA94A9B3DE74A0E3E2FFC273EA596B2BDD8554A3C703E19BE04CF7BBD9155863D65FA1EBE144C573E709798BBF860CA3CC63F44BE"> - memref.global "private" constant @__constant_20x10xf32_cuda : memref<20x10xf32, "cuda"> = 
dense<"0xD60ED23D6677593EEDD4253D579F523E952583BD454F62BE2C2AB53D827CF4BD7F4D41BD15896C3DDEFF3B3D5270AF3D461635BC07D5B8BDB82761BE33120CBE692C35BCDB70183EED6F4D3E271EAD3DFBC1653B77D0473E7920B43C2135C43C1780C2BBD145D43C15B1203DAAD3623C9A1808BECF144D3E7AFAC43D933CF8BDFE637BBD21BE53BEADCA233E250529BD4576FABD2B7B84BCCCB48FBC84B3D6BDAB54263EC31061BEF81913BE3E001C3E4D7A0E3EE4CE4EBE2119CFBDC6B6123EB3CA01BEB506BF3DDA0774BD37FD1E3D1EB44BBE38E409BE994A1FBE74FFDB3DBB05A93D29209A3D756B01BDEAA162BD602A03BE522F3FBE9C1DDC3D486F1EBEB99D21BEE05813BE074943BE4233E33C32BBFA3D81F31ABE747C53BEC1814A3EA77B363E551E113E65E5673D0EDAC4BDC9732E3DD45C3C3E1BCB113E9DD61DBDC754B93DC9CB88BCB5D84DBEF14E443ED54E1EBE659C343E349F62BEF5BBD73D1042FDBDBFDD4EBD306D813DE33C16BE39B00EBE946CD7BDB97CC9BD08A411BDC933103EDE725FBEF3C1553D830E3B3CBFD7253ECFBD0D3EDC6CD83DA6EA563D9238C53C1405B2BD4948D83C0A72B03DEC8AD93DD04F3A3E96216E3C0351123E034807BE0929E7BDD3E3453DCD388B3D68C034BCE7650E3E41C70D3E392C32BEEA885CBC19FFD9BD8B68F83C6FE3B5BCC302633E736E193ED8D1503EBEB64B3EE0263C3EAC4E223DB90511BE36A5443E86AA713C3698A3BCB129A93D517218BEA7BB51BECA19393E8948043E5DE08BBDA1F012BED3483E3EB3F664BEEB093DBEEF2FB7BC3243F6BC2EFA283EB41EF5BDF16A573D19E6E83DDCF4F23DA4D8C3BD4877DFBC1B13303B60330D3ECA5569BD97A090BD3A5E9C3D3565B63DE54F6BBDAC1B29BDDEB4CD3D6C5006BE61B08FBC5AD9DA3B80340EBE933AD83D6334E93CE5F18EBDCC3D73BD128D2ABD46CA34BDDDA5BABDE0C404BD533F223D3072B4BD7D7C413EB831DABD3359BBBB0A69993D6024AD3D0D0EE9BDBD5A80BC7FA6E83D400BFBBD616438BE34CBC73D4769D73D4E295EBEECBE843C838E9CBDA171FA3DDCED3EBE9FEDA6BDAD78A8BDD19702BD18B3043E39CF873C46E602BCD0CBC33D"> memref.global "private" constant @__constant_10xf32_cuda : memref<10xf32, "cuda"> = dense<[0.0670170113, 0.0825609341, -0.125343189, -0.0073415176, -0.100303039, -0.214000896, 0.114002995, 0.21737574, 0.166609675, -0.119800359]> + memref.global "private" constant @__constant_10x20xf32_cuda : memref<10x20xf32, "cuda"> = dense<"0xD60ED23DDEFF3B3DFBC1653B7AFAC43DAB54263EDA0774BD602A03BE747C53BEC754B93D306D813DBFD7253E96216E3CEA885CBCB90511BEA1F012BEDCF4F23DAC1B29BD128D2ABD6024AD3D838E9CBD6677593E5270AF3D77D0473E933CF8BDC31061BE37FD1E3D522F3FBEC1814A3EC9CB88BCE33C16BECFBD0D3E0351123E19FFD9BD36A5443ED3483E3EA4D8C3BDDEB4CD3D46CA34BD0D0EE9BDA171FA3DEDD4253D461635BC7920B43CFE637BBDF81913BE1EB44BBE9C1DDC3DA77B363EB5D84DBE39B00EBEDC6CD83D034807BE8B68F83C86AA713CB3F664BE4877DFBC6C5006BEDDA5BABDBD5A80BCDCED3EBE579F523E07D5B8BD2135C43C21BE53BE3E001C3E38E409BE486F1EBE551E113EF14E443E946CD7BDA6EA563D0929E7BD6FE3B5BC3698A3BCEB093DBE1B13303B61B08FBCE0C404BD7FA6E83D9FEDA6BD952583BDB82761BE1780C2BBADCA233E4D7A0E3E994A1FBEB99D21BE65E5673DD54E1EBEB97CC9BD9238C53CD3E3453DC302633EB129A93DEF2FB7BC60330D3E5AD9DA3B533F223D400BFBBDAD78A8BD454F62BE33120CBED145D43C250529BDE4CE4EBE74FFDB3DE05813BE0EDAC4BD659C343E08A411BD1405B2BDCD388B3D736E193E517218BE3243F6BCCA5569BD80340EBE3072B4BD616438BED19702BD2C2AB53D692C35BC15B1203D4576FABD2119CFBDBB05A93D074943BEC9732E3D349F62BEC933103E4948D83C68C034BCD8D1503EA7BB51BE2EFA283E97A090BD933AD83D7D7C413E34CBC73D18B3043E827CF4BDDB70183EAAD3623C2B7B84BCC6B6123E29209A3D4233E33CD45C3C3EF5BBD73DDE725FBE0A72B03DE7650E3EBEB64B3ECA19393EB41EF5BD3A5E9C3D6334E93CB831DABD4769D73D39CF873C7F4D41BDED6F4D3E9A1808BECCB48FBCB3CA01BE756B01BD32BBFA3D1BCB113E1042FDBDF3C1553DEC8AD93D41C70D3EE0263C3E8948043EF16A573D3565B63DE5F18EBD3359BBBB4E295EBE46E602BC15896C3D271EAD3DCF144D3E84B3D6BDB506BF3DEAA162BD81F31ABE9DD61DBDBFDD4EBD830E3B3CD04F3A3E392C32BEAC4E223D5DE08BBD19E6E83DE54F6BBDCC3D73BD0A69993DECBE843CD0CBC33D"> 
memref.global "private" constant @__constant_20xf32_0_cuda : memref<20xf32, "cuda"> = dense<[0.124238588, -0.0375917405, -0.178324029, 2.1018261E-4, -0.0708629936, 0.179958493, 0.201986402, -0.0302014686, -0.0842267424, 0.0796111747, 0.0201944318, -0.183529228, -0.133614406, -0.0192934573, 0.193412527, 0.219010666, -0.0464102961, 0.00334274326, -0.0029087835, 0.0903228372]> + memref.global "private" constant @__constant_20x20xf32_cuda : memref<20x20xf32, "cuda"> = dense<"0x32A148BE6D99D1BD3C565A3E614360BC1E9BDF3B265A9D3DBD5F18BE8FE25B3EC201E4BC085E783CA941FFBDD2D3953D6BA636BCD3A570BDBF6DFBBD50263FBE6514C1BCC2C95FBEFCD7B43DE477DBBC93869BBDBEED34BD62E5843DC40D4F3EFF38F73C45B35D3EDE0A0EBECC6652BE35B79BBD9044093DF13E123EC271633D17C3273E3C32473E5F161E3C52262FBE1F9E173EA30B07BE3C7D24BE674E2CBEDF0E313EA2CFA0BCA45CAF3C5A313E3EC33C35BD112654BBEDE673BD20FF4CBE09665BBE916844BE4C305C3E6BF8053DBE032E3E614DAD3AB941F3BD207BA93DA59780BD15F2E4BDCE26BD3DD5E3593DADAE1EBDBFCE46BE41EC6DBB34D6FDBD40D157BE8BEEB6BD23F447BED78823BD2F2FC8BD162F39BE5C838CBDA6DFECBD34F4173C6978643EF40BB2BD661E823D511514BD611C4BBE48BF20BEBCCB2FBE5C52443E85B19BBCE2019E3D9C3519BEDD478A3CBD70993DB37F3BBE32E5003EBEFA293E955128BEF44A6EBD5728953D54B10A3EC49A543E4C2E6ABDEC4B56BE9AB8F4BD4FEF053CF7F0D7BBD7D1303E3506E7BB4D7659BE1D3A013E72500C3DD1E6373E7DDE62BE052B503E392A323D668CEDBDB6F210BD2A505EBEB72E1DBD9452EB3D02AA16BE3B33963D58CF3A3E339719BEC73E013E421F193EA9753FBEF2CFB3BD398863BE42C74CBE1B39AABD436BEEBDDB8017BE5683343EE1BAA13D0F397ABDA4C90BBDF0BFA83A2B345DBD82FDD33DDFDE243D7B3507BE913828BEB72639BE9DF7B5BD85BA77BD017FC43D4CF5A13DC7A3DD3D4E3CE63D9B2983BD04BE2E3EE2AC4FBE13D6283EFB3A9BBDC1675D3ECD44643E8BC9E03D7053E83C005014BE7ADBE9BD3344553ED66E42BE38C9D73CED4160BECAE272BAA94A9B3DBC209B3D590C1ABE734E60BE110010BDBD5F2B3D9F8E213EC693A03D9FF356BDC0F1E03D511B3DBED0DA303DE305B1BD578D51BDE1224FBED50454BD57F2B2BDB5062F3E917CFDBDB0FD9CBDE74A0E3E697E213D4A942DBE530B01BEAFAD27BC1D7813BE592C1B3E751BB5BD3C4FE13D164A5FBD8941593E64522C3EF2F3013EF3E116BE6C6826BE689A233EFFB9503E895743BD7FC9523E487E3FBE2FFC273E4A14543D9EB929BECD1844BEEAFFB5BD0EF101BE809B3DBEF94F243EFDFA51BDAA43353C30FEC73C58A96FBD0DE876BD4EDE0F3E5EC4083E263939BEE2FD4A3E413055BE0A8B643EBBF9603DA596B2BDAB303B3E6A0D20BEEC38BE3C4A82103D0369A1BD7DF30B3E5014C23CB60E2C3E19AC2FBEBEC5D73D7211593E8F1098BD47BF58BE7B00783D1D60D1BB2F23A43C423D12BCC300013E8B5E24BED8554A3CB6C1FA3B941C90BD19E862BE8E7AD13C8FF8D93DD463133D003410BEBEE61D3E1A2E90BC8FF2A2BDC3BEA6BC7232E23DC35E793D9E975C3E4E3959BE542B493E402404BE966686BD27A9C33D703E19BE3524DBBC097A76BCFC6F0DBED2693FBE829D3A3E672DEB3DEBBD613EC5EFDA3D3881E6BDCB4F4B3E14EF8ABD4CDE0C3D7195303EEF3A1FBD0BAF203CE2CA34BEB4A8463ED0DC1ABE95A2063E04CF7BBDE1C81ABE0640443EE58D4F3D3E6125BE85CB2EBCEB5A44BECADD55BEE721A0BDC3EA5EBE4F6DC2BBFE5949BEAF43A93DD3A822BEC7AD1DBD719EC8BD2966513E4EA9F0BD05B535BE6DD0BC3D9155863D9DDF51BEAE5B30BE03A7AA3D03FD743D6A2B023EFFA2B03C56C2C3BD13E8F3BC550D21BE2249463E43880FBE1A34DE3DA331963ACB200EBD48F3273EBB4B1FBD57EA383CCD8A573EA359173E65FA1EBEBF11173E723C573E1DCF543C3B3825BEEF8C273E894A33BE8922313D0C34F5BDB6B3023E35888ABC46CE16BDFB1247BD9CF9403EE55E993DFB4B103D391127BE2432453C475E283DDA01A5BD144C573EFFDA4C3EC0EE403E756D503EB7FCD53DB6F3C2BD8A2935BEA41CF7BD3E911D3E1E97B0BD80299C3D097B633D569B3F3E1FD32DBD734D97BC7943B0BDC942A7BD9611263ECECB5EBD505B97BD709798BB6E8A2DBE0F2024BE81D23BBECCDA58BE27F955BD93A6223E0CF8D5BDFD7F38BD500C6DBD70BAFEBD7775193E09619BBD4EFCBB3D4C6E543EA66038BE196DD9BDD58B283EFDE8B13DDEE6713DF860CA3CF9F5013E3D24D53DE0D914BE2B3A4CBEAE3B4
ABCE10741BE13AA29BE3C54933D9577BB3D40F617BEB7E90F3E54502A3EB47251BD762EE83DC91500BE8AC012BD6313C13D367E3EBE5283B43DC63F44BE"> memref.global "private" constant @__constant_20xf32_cuda : memref<20xf32, "cuda"> = dense<[0.101879634, -0.178835288, -0.0953023583, -0.0698504745, -0.19658649, -0.297641844, -0.223349303, 0.168986112, 9.007710e-02, 0.101534814, -0.0601868108, -0.0958566219, -0.243612081, 0.198881537, -0.293788224, -0.240900397, 0.184188008, 0.210917979, 0.121171109, -0.155078679]> + memref.global "private" constant @__constant_20x10xf32_cuda : memref<20x10xf32, "cuda"> = dense<"0xEDAEFD3D2456463E409C203E725C76BE7D51683ED8D783BEA67B8F3E85C231BE4225383D620472BE6B88963EA8E7983EA0BE27BCCA748B3E7ADA4B3E562D9ABEB62B78BEEB7CF33B967059BD67FB97BE1051E33DCE3D9EBCA3380BBED59EA23DDA6EDD3D2B60943E0D424E3DF5FF363DC197B8BDFBAE8CBDFC7732BE690042BEC03C5A3E63F6103E2DFDF1BD88C91ABE5EBB7F3D32DCDBBD4FA407BEF08F553E055CEE3CF0D34CBD2775633D2F3527BEC9A5D5BDB2DC513E056098BEFBFED83D5C0C17BEFAB77D3E07F9413E5AFCAB3D62069C3E354722BE948E7E3E136825BEC5F011BDB640623D0F16953EFCC7D3BC080B9CBECC1AF13D0DF3963DCE329DBECFCE7FBEBB0DBD3DCB26033E66E5EA3D9884E7BC76249D3E9B2F4ABE8E3CF43B259F883E39E496BE6E9E813D8E26E7BD1656DD3C785D2B3E2E6CF6BDA2C404BE5608BD3B0BF2583E7AAD743E46C71CBD85DBE43BF4464EBD60B90DBE4E7470BEAF01453E937B9B3EF8E6DFBDC82B583DEB5AC8BDC849E83D6543143DD9E68E3E7EFF583D651C673EB6F466BE14197E3D507F46BE658D653C4B210F3E8BD2ADBD379970BE5B5115BEE9C118BDDC57473E37968ABE30212D3EC61183BE79C131BECE1F303E2596A73DA80F16BE3FCF123EDDF5673E601A24BE5BC056BE262F63BDACE23ABE9AEF24BD3D4983BDF2DA0ABED17C18BD1E194FBE451A58BDCF5FFABDCBD28E3E7899D1BDF010903E85B4B03DA3A3F63DD46BA03D05E9193E2CACBF3D8E0C32BE38AA14BED9AC493DF72C993E824129BE46DBAF3D3D993FBE752989BD0F403B3DC71E153D121C05BC9EFC60BCACC7103D7537DE3DAFB6D83CEB4013BD868FF1BCB8624CBE26EDA43B96F24B3E8D148E3EE3E5983E46EB73BE6FD39D3E779721BE26A9693E89B98ABE971808BE808C9ABE53E9E9BDD80AEA3D10EE7D3EBB39213D988D4B3D953B2B3EB17CC43C13D72E3DB1C184BDA6A2823D407141BEBE27AA3A2BE0453EFD30F0BDDB132B3E8B44CB3BEDAF77BE5012703ECC23743E08D786BE714361BE1F534B3ECE3C41BE835CD4BD097461BDB09383BEA24E5BBE826D35BE36B4AB3C2314AABC4D2E963D97023EBD0A3B753E426F4FBD8461313C"> } \ No newline at end of file diff --git a/compiler/test/E2E/MLPInference/device_output.ptx b/compiler/test/E2E/MLPInference/device_output.ptx index 06cdbbcdd..8a25dc663 100644 --- a/compiler/test/E2E/MLPInference/device_output.ptx +++ b/compiler/test/E2E/MLPInference/device_output.ptx @@ -31,126 +31,48 @@ ) { .reg .pred %p<3>; - .reg .b32 %r<4>; + .reg .b32 %r<5>; .reg .f32 %f<4>; - .reg .b64 %rd<32>; + .reg .b64 %rd<29>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 19; - @%p1 bra $L__BB0_2; - ld.param.u64 %rd5, [Unknown2_param_13]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown2_param_1]; - ld.param.u64 %rd7, [Unknown2_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - mul.hi.s64 %rd10, %rd4, 7378697629483820647; - shr.u64 %rd11, %rd10, 63; - shr.s64 %rd12, %rd10, 2; - add.s64 %rd13, %rd12, %rd11; - mul.lo.s64 %rd14, %rd13, 10; - sub.s64 %rd15, %rd4, %rd14; - setp.lt.s64 %p2, %rd15, 0; - add.s64 %rd16, %rd15, 10; - selp.b64 %rd17, %rd16, %rd15, %p2; - shr.s64 %rd18, %rd4, 63; - xor.b64 %rd19, %rd18, %rd4; - mul.hi.s64 %rd20, %rd19, 7378697629483820647; - shr.u64 %rd21, %rd20, 63; - shr.u64 %rd22, %rd20, 2; - add.s64 %rd23, %rd22, %rd21; - xor.b64 %rd24, %rd23, 
%rd18; - mul.lo.s64 %rd25, %rd24, 10; - add.s64 %rd26, %rd25, %rd17; - shl.b64 %rd27, %rd26, 2; - add.s64 %rd28, %rd2, %rd27; - ld.global.f32 %f1, [%rd28]; - shl.b64 %rd29, %rd17, 2; - add.s64 %rd30, %rd3, %rd29; - ld.global.f32 %f2, [%rd30]; - add.rn.f32 %f3, %f1, %f2; - add.s64 %rd31, %rd1, %rd27; - st.global.f32 [%rd31], %f3; + cvt.s64.s32 %rd15, %r3; + mul.wide.s32 %rd16, %r2, %r1; + add.s64 %rd28, %rd16, %rd15; + setp.gt.s64 %p1, %rd28, 19; + @%p1 bra $L__BB0_3; + ld.param.u64 %rd12, [Unknown2_param_13]; + cvta.to.global.u64 %rd1, %rd12; + ld.param.u64 %rd13, [Unknown2_param_1]; + ld.param.u64 %rd14, [Unknown2_param_6]; + cvta.to.global.u64 %rd2, %rd14; + cvta.to.global.u64 %rd3, %rd13; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd5, %r2, %r4; + shl.b64 %rd27, %rd28, 2; + shl.b64 %rd7, %rd5, 2; $L__BB0_2: - ret; - -} - // .globl Unknown1 -.visible .entry Unknown1( - .param .u64 Unknown1_param_0, - .param .u64 Unknown1_param_1, - .param .u64 Unknown1_param_2, - .param .u64 Unknown1_param_3, - .param .u64 Unknown1_param_4, - .param .u64 Unknown1_param_5, - .param .u64 Unknown1_param_6, - .param .u64 Unknown1_param_7, - .param .u64 Unknown1_param_8, - .param .u64 Unknown1_param_9, - .param .u64 Unknown1_param_10, - .param .u64 Unknown1_param_11, - .param .u64 Unknown1_param_12, - .param .u64 Unknown1_param_13, - .param .u64 Unknown1_param_14, - .param .u64 Unknown1_param_15, - .param .u64 Unknown1_param_16, - .param .u64 Unknown1_param_17, - .param .u64 Unknown1_param_18 -) -{ - .reg .pred %p<3>; - .reg .b32 %r<4>; - .reg .f32 %f<5>; - .reg .b64 %rd<32>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 39; - @%p1 bra $L__BB1_2; - ld.param.u64 %rd5, [Unknown1_param_13]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown1_param_1]; - ld.param.u64 %rd7, [Unknown1_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - mul.hi.s64 %rd10, %rd4, 7378697629483820647; - shr.u64 %rd11, %rd10, 63; - shr.s64 %rd12, %rd10, 3; - add.s64 %rd13, %rd12, %rd11; - mul.lo.s64 %rd14, %rd13, 20; - sub.s64 %rd15, %rd4, %rd14; - setp.lt.s64 %p2, %rd15, 0; - add.s64 %rd16, %rd15, 20; - selp.b64 %rd17, %rd16, %rd15, %p2; - shr.s64 %rd18, %rd4, 63; - xor.b64 %rd19, %rd18, %rd4; - mul.hi.s64 %rd20, %rd19, 7378697629483820647; - shr.u64 %rd21, %rd20, 63; - shr.s64 %rd22, %rd20, 3; - add.s64 %rd23, %rd22, %rd21; - xor.b64 %rd24, %rd23, %rd18; - mul.lo.s64 %rd25, %rd24, 20; - add.s64 %rd26, %rd25, %rd17; - shl.b64 %rd27, %rd26, 2; - add.s64 %rd28, %rd2, %rd27; - ld.global.f32 %f1, [%rd28]; - shl.b64 %rd29, %rd17, 2; - add.s64 %rd30, %rd3, %rd29; - ld.global.f32 %f2, [%rd30]; + mul.hi.s64 %rd17, %rd28, 7378697629483820647; + shr.u64 %rd18, %rd17, 63; + shr.u64 %rd19, %rd17, 2; + add.s64 %rd20, %rd19, %rd18; + mul.lo.s64 %rd21, %rd20, 10; + sub.s64 %rd22, %rd28, %rd21; + shl.b64 %rd23, %rd22, 2; + add.s64 %rd24, %rd3, %rd23; + ld.global.nc.f32 %f1, [%rd24]; + add.s64 %rd25, %rd2, %rd27; + ld.global.nc.f32 %f2, [%rd25]; add.rn.f32 %f3, %f1, %f2; - max.f32 %f4, %f3, 0f00000000; - add.s64 %rd31, %rd1, %rd27; - st.global.f32 [%rd31], %f4; -$L__BB1_2: + add.s64 %rd26, %rd1, %rd27; + st.global.f32 [%rd26], %f3; + add.s64 %rd28, %rd28, %rd5; + add.s64 %rd27, %rd27, %rd7; + setp.lt.s64 %p2, %rd28, 20; + @%p2 bra $L__BB0_2; +$L__BB0_3: ret; } @@ -178,53 +100,49 @@ $L__BB1_2: ) { .reg .pred %p<3>; - .reg .b32 %r<4>; + .reg .b32 %r<5>; .reg .f32 %f<5>; - .reg .b64 
%rd<32>; + .reg .b64 %rd<29>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 39; - @%p1 bra $L__BB2_2; - ld.param.u64 %rd5, [Unknown0_param_13]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown0_param_1]; - ld.param.u64 %rd7, [Unknown0_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - mul.hi.s64 %rd10, %rd4, 7378697629483820647; - shr.u64 %rd11, %rd10, 63; - shr.s64 %rd12, %rd10, 3; - add.s64 %rd13, %rd12, %rd11; - mul.lo.s64 %rd14, %rd13, 20; - sub.s64 %rd15, %rd4, %rd14; - setp.lt.s64 %p2, %rd15, 0; - add.s64 %rd16, %rd15, 20; - selp.b64 %rd17, %rd16, %rd15, %p2; - shr.s64 %rd18, %rd4, 63; - xor.b64 %rd19, %rd18, %rd4; - mul.hi.s64 %rd20, %rd19, 7378697629483820647; - shr.u64 %rd21, %rd20, 63; - shr.s64 %rd22, %rd20, 3; - add.s64 %rd23, %rd22, %rd21; - xor.b64 %rd24, %rd23, %rd18; - mul.lo.s64 %rd25, %rd24, 20; - add.s64 %rd26, %rd25, %rd17; - shl.b64 %rd27, %rd26, 2; - add.s64 %rd28, %rd2, %rd27; - ld.global.f32 %f1, [%rd28]; - shl.b64 %rd29, %rd17, 2; - add.s64 %rd30, %rd3, %rd29; - ld.global.f32 %f2, [%rd30]; + cvt.s64.s32 %rd15, %r3; + mul.wide.s32 %rd16, %r2, %r1; + add.s64 %rd28, %rd16, %rd15; + setp.gt.s64 %p1, %rd28, 39; + @%p1 bra $L__BB1_3; + ld.param.u64 %rd12, [Unknown0_param_13]; + cvta.to.global.u64 %rd1, %rd12; + ld.param.u64 %rd13, [Unknown0_param_1]; + ld.param.u64 %rd14, [Unknown0_param_6]; + cvta.to.global.u64 %rd2, %rd14; + cvta.to.global.u64 %rd3, %rd13; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd5, %r2, %r4; + shl.b64 %rd27, %rd28, 2; + shl.b64 %rd7, %rd5, 2; +$L__BB1_2: + mul.hi.s64 %rd17, %rd28, 7378697629483820647; + shr.u64 %rd18, %rd17, 63; + shr.s64 %rd19, %rd17, 3; + add.s64 %rd20, %rd19, %rd18; + mul.lo.s64 %rd21, %rd20, 20; + sub.s64 %rd22, %rd28, %rd21; + shl.b64 %rd23, %rd22, 2; + add.s64 %rd24, %rd3, %rd23; + ld.global.nc.f32 %f1, [%rd24]; + add.s64 %rd25, %rd2, %rd27; + ld.global.nc.f32 %f2, [%rd25]; add.rn.f32 %f3, %f1, %f2; - max.f32 %f4, %f3, 0f00000000; - add.s64 %rd31, %rd1, %rd27; - st.global.f32 [%rd31], %f4; -$L__BB2_2: + max.NaN.f32 %f4, %f3, 0f00000000; + add.s64 %rd26, %rd1, %rd27; + st.global.f32 [%rd26], %f4; + add.s64 %rd28, %rd28, %rd5; + add.s64 %rd27, %rd27, %rd7; + setp.lt.s64 %p2, %rd28, 40; + @%p2 bra $L__BB1_2; +$L__BB1_3: ret; } diff --git a/compiler/test/E2E/MLPInference/host_output.mlir b/compiler/test/E2E/MLPInference/host_output.mlir index 70f74be53..76b0c8720 100644 --- a/compiler/test/E2E/MLPInference/host_output.mlir +++ b/compiler/test/E2E/MLPInference/host_output.mlir @@ -3,29 +3,17 @@ // CHECK-LABEL: func.func @forward module attributes {byre.container_module, gpu.container_module, torch.debug_module_name = "GraphModule"} { - func.func @forward(%arg0: memref<2x10xf32, "cuda"> {byre.argname = "Input0", byre.argtype = 1 : i32}, %arg1: memref<2x10xf32, "cuda"> {byre.argname = "Output0", byre.argtype = 2 : i32}) attributes {byre.entry_point, device_file_name = "your_file"} { - %alloc = memref.alloc() : memref<320xi8, "cuda"> - %alloc_0 = memref.alloc() : memref<20xf32, "cuda"> - byre.compute @FillOp(%alloc_0) {device = "cuda", memory_effects = [2 : i32], value = dense<[0.101879634, -0.178835288, -0.0953023583, -0.0698504745, -0.19658649, -0.297641844, -0.223349303, 0.168986112, 9.007710e-02, 0.101534814, -0.0601868108, -0.0958566219, -0.243612081, 0.198881537, -0.293788224, -0.240900397, 0.184188008, 0.210917979, 0.121171109, -0.155078679]> : tensor<20xf32>} 
: memref<20xf32, "cuda"> - %alloc_1 = memref.alloc() : memref<20xf32, "cuda"> - byre.compute @FillOp(%alloc_1) {device = "cuda", memory_effects = [2 : i32], value = dense<[0.124238588, -0.0375917405, -0.178324029, 2.1018261E-4, -0.0708629936, 0.179958493, 0.201986402, -0.0302014686, -0.0842267424, 0.0796111747, 0.0201944318, -0.183529228, -0.133614406, -0.0192934573, 0.193412527, 0.219010666, -0.0464102961, 0.00334274326, -0.0029087835, 0.0903228372]> : tensor<20xf32>} : memref<20xf32, "cuda"> - %alloc_2 = memref.alloc() : memref<10xf32, "cuda"> - byre.compute @FillOp(%alloc_2) {device = "cuda", memory_effects = [2 : i32], value = dense<[0.0670170113, 0.0825609341, -0.125343189, -0.0073415176, -0.100303039, -0.214000896, 0.114002995, 0.21737574, 0.166609675, -0.119800359]> : tensor<10xf32>} : memref<10xf32, "cuda"> - %alloc_3 = memref.alloc() : memref<20x10xf32, "cuda"> - byre.compute @FillOp(%alloc_3) {device = "cuda", memory_effects = [2 : i32], value = dense<"0xD60ED23D6677593EEDD4253D579F523E952583BD454F62BE2C2AB53D827CF4BD7F4D41BD15896C3DDEFF3B3D5270AF3D461635BC07D5B8BDB82761BE33120CBE692C35BCDB70183EED6F4D3E271EAD3DFBC1653B77D0473E7920B43C2135C43C1780C2BBD145D43C15B1203DAAD3623C9A1808BECF144D3E7AFAC43D933CF8BDFE637BBD21BE53BEADCA233E250529BD4576FABD2B7B84BCCCB48FBC84B3D6BDAB54263EC31061BEF81913BE3E001C3E4D7A0E3EE4CE4EBE2119CFBDC6B6123EB3CA01BEB506BF3DDA0774BD37FD1E3D1EB44BBE38E409BE994A1FBE74FFDB3DBB05A93D29209A3D756B01BDEAA162BD602A03BE522F3FBE9C1DDC3D486F1EBEB99D21BEE05813BE074943BE4233E33C32BBFA3D81F31ABE747C53BEC1814A3EA77B363E551E113E65E5673D0EDAC4BDC9732E3DD45C3C3E1BCB113E9DD61DBDC754B93DC9CB88BCB5D84DBEF14E443ED54E1EBE659C343E349F62BEF5BBD73D1042FDBDBFDD4EBD306D813DE33C16BE39B00EBE946CD7BDB97CC9BD08A411BDC933103EDE725FBEF3C1553D830E3B3CBFD7253ECFBD0D3EDC6CD83DA6EA563D9238C53C1405B2BD4948D83C0A72B03DEC8AD93DD04F3A3E96216E3C0351123E034807BE0929E7BDD3E3453DCD388B3D68C034BCE7650E3E41C70D3E392C32BEEA885CBC19FFD9BD8B68F83C6FE3B5BCC302633E736E193ED8D1503EBEB64B3EE0263C3EAC4E223DB90511BE36A5443E86AA713C3698A3BCB129A93D517218BEA7BB51BECA19393E8948043E5DE08BBDA1F012BED3483E3EB3F664BEEB093DBEEF2FB7BC3243F6BC2EFA283EB41EF5BDF16A573D19E6E83DDCF4F23DA4D8C3BD4877DFBC1B13303B60330D3ECA5569BD97A090BD3A5E9C3D3565B63DE54F6BBDAC1B29BDDEB4CD3D6C5006BE61B08FBC5AD9DA3B80340EBE933AD83D6334E93CE5F18EBDCC3D73BD128D2ABD46CA34BDDDA5BABDE0C404BD533F223D3072B4BD7D7C413EB831DABD3359BBBB0A69993D6024AD3D0D0EE9BDBD5A80BC7FA6E83D400BFBBD616438BE34CBC73D4769D73D4E295EBEECBE843C838E9CBDA171FA3DDCED3EBE9FEDA6BDAD78A8BDD19702BD18B3043E39CF873C46E602BCD0CBC33D"> : tensor<20x10xf32>} : memref<20x10xf32, "cuda"> - %alloc_4 = memref.alloc() : memref<20x20xf32, "cuda"> - byre.compute @FillOp(%alloc_4) {device = "cuda", memory_effects = [2 : i32], value = 
dense<"0x32A148BE93869BBDDF0E313EADAE1EBD5C52443E3506E7BBF2CFB3BD4CF5A13DBC209B3D697E213D4A14543DAB303B3EB6C1FA3B3524DBBCE1C81ABE9DDF51BEBF11173EFFDA4C3E6E8A2DBEF9F5013E6D99D1BDBEED34BDA2CFA0BCBFCE46BE85B19BBC4D7659BE398863BEC7A3DD3D590C1ABE4A942DBE9EB929BE6A0D20BE941C90BD097A76BC0640443EAE5B30BE723C573EC0EE403E0F2024BE3D24D53D3C565A3E62E5843DA45CAF3C41EC6DBBE2019E3D1D3A013E42C74CBE4E3CE63D734E60BE530B01BECD1844BEEC38BE3C19E862BEFC6F0DBEE58D4F3D03A7AA3D1DCF543C756D503E81D23BBEE0D914BE614360BCC40D4F3E5A313E3E34D6FDBD9C3519BE72500C3D1B39AABD9B2983BD110010BDAFAD27BCEAFFB5BD4A82103D8E7AD13CD2693FBE3E6125BE03FD743D3B3825BEB7FCD53DCCDA58BE2B3A4CBE1E9BDF3BFF38F73CC33C35BD40D157BEDD478A3CD1E6373E436BEEBD04BE2E3EBD5F2B3D1D7813BE0EF101BE0369A1BD8FF8D93D829D3A3E85CB2EBC6A2B023EEF8C273EB6F3C2BD27F955BDAE3B4ABC265A9D3D45B35D3E112654BB8BEEB6BDBD70993D7DDE62BEDB8017BEE2AC4FBE9F8E213E592C1B3E809B3DBE7DF30B3ED463133D672DEB3DEB5A44BEFFA2B03C894A33BE8A2935BE93A6223EE10741BEBD5F18BEDE0A0EBEEDE673BD23F447BEB37F3BBE052B503E5683343E13D6283EC693A03D751BB5BDF94F243E5014C23C003410BEEBBD613ECADD55BE56C2C3BD8922313DA41CF7BD0CF8D5BD13AA29BE8FE25B3ECC6652BE20FF4CBED78823BD32E5003E392A323DE1BAA13DFB3A9BBD9FF356BD3C4FE13DFDFA51BDB60E2C3EBEE61D3EC5EFDA3DE721A0BD13E8F3BC0C34F5BD3E911D3EFD7F38BD3C54933DC201E4BC35B79BBD09665BBE2F2FC8BDBEFA293E668CEDBD0F397ABDC1675D3EC0F1E03D164A5FBDAA43353C19AC2FBE1A2E90BC3881E6BDC3EA5EBE550D21BEB6B3023E1E97B0BD500C6DBD9577BB3D085E783C9044093D916844BE162F39BE955128BEB6F210BDA4C90BBDCD44643E511B3DBE8941593E30FEC73CBEC5D73D8FF2A2BDCB4F4B3E4F6DC2BB2249463E35888ABC80299C3D70BAFEBD40F617BEA941FFBDF13E123E4C305C3E5C838CBDF44A6EBD2A505EBEF0BFA83A8BC9E03DD0DA303D64522C3E58A96FBD7211593EC3BEA6BC14EF8ABDFE5949BE43880FBE46CE16BD097B633D7775193EB7E90F3ED2D3953DC271633D6BF8053DA6DFECBD5728953DB72E1DBD2B345DBD7053E83CE305B1BDF2F3013E0DE876BD8F1098BD7232E23D4CDE0C3DAF43A93D1A34DE3DFB1247BD569B3F3E09619BBD54502A3E6BA636BC17C3273EBE032E3E34F4173C54B10A3E9452EB3D82FDD33D005014BE578D51BDF3E116BE4EDE0F3E47BF58BEC35E793D7195303ED3A822BEA331963A9CF9403E1FD32DBD4EFCBB3DB47251BDD3A570BD3C32473E614DAD3A6978643EC49A543E02AA16BEDFDE243D7ADBE9BDE1224FBE6C6826BE5EC4083E7B00783D9E975C3EEF3A1FBDC7AD1DBDCB200EBDE55E993D734D97BC4C6E543E762EE83DBF6DFBBD5F161E3CB941F3BDF40BB2BD4C2E6ABD3B33963D7B3507BE3344553ED50454BD689A233E263939BE1D60D1BB4E3959BE0BAF203C719EC8BD48F3273EFB4B103D7943B0BDA66038BEC91500BE50263FBE52262FBE207BA93D661E823DEC4B56BE58CF3A3E913828BED66E42BE57F2B2BDFFB9503EE2FD4A3E2F23A43C542B493EE2CA34BE2966513EBB4B1FBD391127BEC942A7BD196DD9BD8AC012BD6514C1BC1F9E173EA59780BD511514BD9AB8F4BD339719BEB72639BE38C9D73CB5062F3E895743BD413055BE423D12BC402404BEB4A8463E4EA9F0BD57EA383C2432453C9611263ED58B283E6313C13DC2C95FBEA30B07BE15F2E4BD611C4BBE4FEF053CC73E013E9DF7B5BDED4160BE917CFDBD7FC9523E0A8B643EC300013E966686BDD0DC1ABE05B535BECD8A573E475E283DCECB5EBDFDE8B13D367E3EBEFCD7B43D3C7D24BECE26BD3D48BF20BEF7F0D7BB421F193E85BA77BDCAE272BAB0FD9CBD487E3FBEBBF9603D8B5E24BE27A9C33D95A2063E6DD0BC3DA359173EDA01A5BD505B97BDDEE6713D5283B43DE477DBBC674E2CBED5E3593DBCCB2FBED7D1303EA9753FBE017FC43DA94A9B3DE74A0E3E2FFC273EA596B2BDD8554A3C703E19BE04CF7BBD9155863D65FA1EBE144C573E709798BBF860CA3CC63F44BE"> : tensor<20x20xf32>} : memref<20x20xf32, "cuda"> - %alloc_5 = memref.alloc() : memref<10x20xf32, "cuda"> - byre.compute @FillOp(%alloc_5) {device = "cuda", memory_effects = [2 : i32], value = 
dense<"0xEDAEFD3D6B88963E1051E33DFC7732BE055CEE3C07F9413E080B9CBE9B2F4ABE5608BD3BF8E6DFBD507F46BEC61183BEACE23ABEF010903E824129BEAFB6D83C779721BE953B2B3E8B44CB3BB09383BE2456463EA8E7983ECE3D9EBC690042BEF0D34CBD5AFCAB3DCC1AF13D8E3CF43B0BF2583EC82B583D658D653C79C131BE9AEF24BD85B4B03D46DBAF3DEB4013BD26A9693EB17CC43CEDAF77BEA24E5BBE409C203EA0BE27BCA3380BBEC03C5A3E2775633D62069C3E0DF3963D259F883E7AAD743EEB5AC8BD4B210F3ECE1F303E3D4983BDA3A3F63D3D993FBE868FF1BC89B98ABE13D72E3D5012703E826D35BE725C76BECA748B3ED59EA23D63F6103E2F3527BE354722BECE329DBE39E496BE46C71CBDC849E83D8BD2ADBD2596A73DF2DA0ABED46BA03D752989BDB8624CBE971808BEB1C184BDCC23743E36B4AB3C7D51683E7ADA4B3EDA6EDD3D2DFDF1BDC9A5D5BD948E7E3ECFCE7FBE6E9E813D85DBE43B6543143D379970BEA80F16BED17C18BD05E9193E0F403B3D26EDA43B808C9ABEA6A2823D08D786BE2314AABCD8D783BE562D9ABE2B60943E88C91ABEB2DC513E136825BEBB0DBD3D8E26E7BDF4464EBDD9E68E3E5B5115BE3FCF123E1E194FBE2CACBF3DC71E153D96F24B3E53E9E9BD407141BE714361BE4D2E963DA67B8F3EB62B78BE0D424E3D5EBB7F3D056098BEC5F011BDCB26033E1656DD3C60B90DBE7EFF583DE9C118BDDDF5673E451A58BD8E0C32BE121C05BC8D148E3ED80AEA3DBE27AA3A1F534B3E97023EBD85C231BEEB7CF33BF5FF363D32DCDBBDFBFED83DB640623D66E5EA3D785D2B3E4E7470BE651C673EDC57473E601A24BECF5FFABD38AA14BE9EFC60BCE3E5983E10EE7D3E2BE0453ECE3C41BE0A3B753E4225383D967059BDC197B8BD4FA407BE5C0C17BE0F16953E9884E7BC2E6CF6BDAF01453EB6F466BE37968ABE5BC056BECBD28E3ED9AC493DACC7103D46EB73BEBB39213DFD30F0BD835CD4BD426F4FBD620472BE67FB97BEFBAE8CBDF08F553EFAB77D3EFCC7D3BC76249D3EA2C404BE937B9B3E14197E3D30212D3E262F63BD7899D1BDF72C993E7537DE3D6FD39D3E988D4B3DDB132B3E097461BD8461313C"> : tensor<10x20xf32>} : memref<10x20xf32, "cuda"> - %0 = "byre.alias"(%alloc) {device = "cuda", offset = 0 : i64} : (memref<320xi8, "cuda">) -> memref<2x20xf32, "cuda"> - byre.compute @MatmulOp_f32f32_f32(%arg0, %alloc_5, %0) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<2x10xf32, "cuda">, memref<10x20xf32, "cuda">, memref<2x20xf32, "cuda"> - %1 = "byre.alias"(%alloc) {device = "cuda", offset = 160 : i64} : (memref<320xi8, "cuda">) -> memref<2x20xf32, "cuda"> - byre.compute @PTXOp(%alloc_0, %0, %1) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown0", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<20xf32, "cuda">, memref<2x20xf32, "cuda">, memref<2x20xf32, "cuda"> - byre.compute @MatmulOp_f32f32_f32(%1, %alloc_4, %0) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<2x20xf32, "cuda">, memref<20x20xf32, "cuda">, memref<2x20xf32, "cuda"> - byre.compute @PTXOp(%alloc_1, %0, %1) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown1", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<20xf32, "cuda">, memref<2x20xf32, "cuda">, memref<2x20xf32, "cuda"> - %2 = "byre.alias"(%alloc) {device = "cuda", offset = 0 : i64} : (memref<320xi8, "cuda">) -> memref<2x10xf32, "cuda"> - byre.compute @MatmulOp_f32f32_f32(%1, %alloc_3, %2) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<2x20xf32, "cuda">, memref<20x10xf32, "cuda">, memref<2x10xf32, "cuda"> - byre.compute @PTXOp(%alloc_2, %2, %arg1) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : 
i32], device = "cuda", kernel_name = "Unknown2", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<10xf32, "cuda">, memref<2x10xf32, "cuda">, memref<2x10xf32, "cuda"> + func.func @forward(%arg0: memref<20x10xf32, "cuda"> {byre.argname = "Weight0", byre.argtype = 4 : i32, byre.weight_value = dense<"0xEDAEFD3D2456463E409C203E725C76BE7D51683ED8D783BEA67B8F3E85C231BE4225383D620472BE6B88963EA8E7983EA0BE27BCCA748B3E7ADA4B3E562D9ABEB62B78BEEB7CF33B967059BD67FB97BE1051E33DCE3D9EBCA3380BBED59EA23DDA6EDD3D2B60943E0D424E3DF5FF363DC197B8BDFBAE8CBDFC7732BE690042BEC03C5A3E63F6103E2DFDF1BD88C91ABE5EBB7F3D32DCDBBD4FA407BEF08F553E055CEE3CF0D34CBD2775633D2F3527BEC9A5D5BDB2DC513E056098BEFBFED83D5C0C17BEFAB77D3E07F9413E5AFCAB3D62069C3E354722BE948E7E3E136825BEC5F011BDB640623D0F16953EFCC7D3BC080B9CBECC1AF13D0DF3963DCE329DBECFCE7FBEBB0DBD3DCB26033E66E5EA3D9884E7BC76249D3E9B2F4ABE8E3CF43B259F883E39E496BE6E9E813D8E26E7BD1656DD3C785D2B3E2E6CF6BDA2C404BE5608BD3B0BF2583E7AAD743E46C71CBD85DBE43BF4464EBD60B90DBE4E7470BEAF01453E937B9B3EF8E6DFBDC82B583DEB5AC8BDC849E83D6543143DD9E68E3E7EFF583D651C673EB6F466BE14197E3D507F46BE658D653C4B210F3E8BD2ADBD379970BE5B5115BEE9C118BDDC57473E37968ABE30212D3EC61183BE79C131BECE1F303E2596A73DA80F16BE3FCF123EDDF5673E601A24BE5BC056BE262F63BDACE23ABE9AEF24BD3D4983BDF2DA0ABED17C18BD1E194FBE451A58BDCF5FFABDCBD28E3E7899D1BDF010903E85B4B03DA3A3F63DD46BA03D05E9193E2CACBF3D8E0C32BE38AA14BED9AC493DF72C993E824129BE46DBAF3D3D993FBE752989BD0F403B3DC71E153D121C05BC9EFC60BCACC7103D7537DE3DAFB6D83CEB4013BD868FF1BCB8624CBE26EDA43B96F24B3E8D148E3EE3E5983E46EB73BE6FD39D3E779721BE26A9693E89B98ABE971808BE808C9ABE53E9E9BDD80AEA3D10EE7D3EBB39213D988D4B3D953B2B3EB17CC43C13D72E3DB1C184BDA6A2823D407141BEBE27AA3A2BE0453EFD30F0BDDB132B3E8B44CB3BEDAF77BE5012703ECC23743E08D786BE714361BE1F534B3ECE3C41BE835CD4BD097461BDB09383BEA24E5BBE826D35BE36B4AB3C2314AABC4D2E963D97023EBD0A3B753E426F4FBD8461313C"> : tensor<20x10xf32>}, %arg1: memref<20xf32, "cuda"> {byre.argname = "Weight1", byre.argtype = 4 : i32, byre.weight_value = dense<[0.101879634, -0.178835288, -0.0953023583, -0.0698504745, -0.19658649, -0.297641844, -0.223349303, 0.168986112, 9.007710e-02, 0.101534814, -0.0601868108, -0.0958566219, -0.243612081, 0.198881537, -0.293788224, -0.240900397, 0.184188008, 0.210917979, 0.121171109, -0.155078679]> : tensor<20xf32>}, %arg2: memref<20x20xf32, "cuda"> {byre.argname = "Weight2", byre.argtype = 4 : i32, byre.weight_value = 
dense<"0x32A148BE6D99D1BD3C565A3E614360BC1E9BDF3B265A9D3DBD5F18BE8FE25B3EC201E4BC085E783CA941FFBDD2D3953D6BA636BCD3A570BDBF6DFBBD50263FBE6514C1BCC2C95FBEFCD7B43DE477DBBC93869BBDBEED34BD62E5843DC40D4F3EFF38F73C45B35D3EDE0A0EBECC6652BE35B79BBD9044093DF13E123EC271633D17C3273E3C32473E5F161E3C52262FBE1F9E173EA30B07BE3C7D24BE674E2CBEDF0E313EA2CFA0BCA45CAF3C5A313E3EC33C35BD112654BBEDE673BD20FF4CBE09665BBE916844BE4C305C3E6BF8053DBE032E3E614DAD3AB941F3BD207BA93DA59780BD15F2E4BDCE26BD3DD5E3593DADAE1EBDBFCE46BE41EC6DBB34D6FDBD40D157BE8BEEB6BD23F447BED78823BD2F2FC8BD162F39BE5C838CBDA6DFECBD34F4173C6978643EF40BB2BD661E823D511514BD611C4BBE48BF20BEBCCB2FBE5C52443E85B19BBCE2019E3D9C3519BEDD478A3CBD70993DB37F3BBE32E5003EBEFA293E955128BEF44A6EBD5728953D54B10A3EC49A543E4C2E6ABDEC4B56BE9AB8F4BD4FEF053CF7F0D7BBD7D1303E3506E7BB4D7659BE1D3A013E72500C3DD1E6373E7DDE62BE052B503E392A323D668CEDBDB6F210BD2A505EBEB72E1DBD9452EB3D02AA16BE3B33963D58CF3A3E339719BEC73E013E421F193EA9753FBEF2CFB3BD398863BE42C74CBE1B39AABD436BEEBDDB8017BE5683343EE1BAA13D0F397ABDA4C90BBDF0BFA83A2B345DBD82FDD33DDFDE243D7B3507BE913828BEB72639BE9DF7B5BD85BA77BD017FC43D4CF5A13DC7A3DD3D4E3CE63D9B2983BD04BE2E3EE2AC4FBE13D6283EFB3A9BBDC1675D3ECD44643E8BC9E03D7053E83C005014BE7ADBE9BD3344553ED66E42BE38C9D73CED4160BECAE272BAA94A9B3DBC209B3D590C1ABE734E60BE110010BDBD5F2B3D9F8E213EC693A03D9FF356BDC0F1E03D511B3DBED0DA303DE305B1BD578D51BDE1224FBED50454BD57F2B2BDB5062F3E917CFDBDB0FD9CBDE74A0E3E697E213D4A942DBE530B01BEAFAD27BC1D7813BE592C1B3E751BB5BD3C4FE13D164A5FBD8941593E64522C3EF2F3013EF3E116BE6C6826BE689A233EFFB9503E895743BD7FC9523E487E3FBE2FFC273E4A14543D9EB929BECD1844BEEAFFB5BD0EF101BE809B3DBEF94F243EFDFA51BDAA43353C30FEC73C58A96FBD0DE876BD4EDE0F3E5EC4083E263939BEE2FD4A3E413055BE0A8B643EBBF9603DA596B2BDAB303B3E6A0D20BEEC38BE3C4A82103D0369A1BD7DF30B3E5014C23CB60E2C3E19AC2FBEBEC5D73D7211593E8F1098BD47BF58BE7B00783D1D60D1BB2F23A43C423D12BCC300013E8B5E24BED8554A3CB6C1FA3B941C90BD19E862BE8E7AD13C8FF8D93DD463133D003410BEBEE61D3E1A2E90BC8FF2A2BDC3BEA6BC7232E23DC35E793D9E975C3E4E3959BE542B493E402404BE966686BD27A9C33D703E19BE3524DBBC097A76BCFC6F0DBED2693FBE829D3A3E672DEB3DEBBD613EC5EFDA3D3881E6BDCB4F4B3E14EF8ABD4CDE0C3D7195303EEF3A1FBD0BAF203CE2CA34BEB4A8463ED0DC1ABE95A2063E04CF7BBDE1C81ABE0640443EE58D4F3D3E6125BE85CB2EBCEB5A44BECADD55BEE721A0BDC3EA5EBE4F6DC2BBFE5949BEAF43A93DD3A822BEC7AD1DBD719EC8BD2966513E4EA9F0BD05B535BE6DD0BC3D9155863D9DDF51BEAE5B30BE03A7AA3D03FD743D6A2B023EFFA2B03C56C2C3BD13E8F3BC550D21BE2249463E43880FBE1A34DE3DA331963ACB200EBD48F3273EBB4B1FBD57EA383CCD8A573EA359173E65FA1EBEBF11173E723C573E1DCF543C3B3825BEEF8C273E894A33BE8922313D0C34F5BDB6B3023E35888ABC46CE16BDFB1247BD9CF9403EE55E993DFB4B103D391127BE2432453C475E283DDA01A5BD144C573EFFDA4C3EC0EE403E756D503EB7FCD53DB6F3C2BD8A2935BEA41CF7BD3E911D3E1E97B0BD80299C3D097B633D569B3F3E1FD32DBD734D97BC7943B0BDC942A7BD9611263ECECB5EBD505B97BD709798BB6E8A2DBE0F2024BE81D23BBECCDA58BE27F955BD93A6223E0CF8D5BDFD7F38BD500C6DBD70BAFEBD7775193E09619BBD4EFCBB3D4C6E543EA66038BE196DD9BDD58B283EFDE8B13DDEE6713DF860CA3CF9F5013E3D24D53DE0D914BE2B3A4CBEAE3B4ABCE10741BE13AA29BE3C54933D9577BB3D40F617BEB7E90F3E54502A3EB47251BD762EE83DC91500BE8AC012BD6313C13D367E3EBE5283B43DC63F44BE"> : tensor<20x20xf32>}, %arg3: memref<20xf32, "cuda"> {byre.argname = "Weight3", byre.argtype = 4 : i32, byre.weight_value = dense<[0.124238588, -0.0375917405, -0.178324029, 2.1018261E-4, -0.0708629936, 0.179958493, 0.201986402, -0.0302014686, -0.0842267424, 0.0796111747, 0.0201944318, -0.183529228, -0.133614406, -0.0192934573, 0.193412527, 
0.219010666, -0.0464102961, 0.00334274326, -0.0029087835, 0.0903228372]> : tensor<20xf32>}, %arg4: memref<10x20xf32, "cuda"> {byre.argname = "Weight4", byre.argtype = 4 : i32, byre.weight_value = dense<"0xD60ED23DDEFF3B3DFBC1653B7AFAC43DAB54263EDA0774BD602A03BE747C53BEC754B93D306D813DBFD7253E96216E3CEA885CBCB90511BEA1F012BEDCF4F23DAC1B29BD128D2ABD6024AD3D838E9CBD6677593E5270AF3D77D0473E933CF8BDC31061BE37FD1E3D522F3FBEC1814A3EC9CB88BCE33C16BECFBD0D3E0351123E19FFD9BD36A5443ED3483E3EA4D8C3BDDEB4CD3D46CA34BD0D0EE9BDA171FA3DEDD4253D461635BC7920B43CFE637BBDF81913BE1EB44BBE9C1DDC3DA77B363EB5D84DBE39B00EBEDC6CD83D034807BE8B68F83C86AA713CB3F664BE4877DFBC6C5006BEDDA5BABDBD5A80BCDCED3EBE579F523E07D5B8BD2135C43C21BE53BE3E001C3E38E409BE486F1EBE551E113EF14E443E946CD7BDA6EA563D0929E7BD6FE3B5BC3698A3BCEB093DBE1B13303B61B08FBCE0C404BD7FA6E83D9FEDA6BD952583BDB82761BE1780C2BBADCA233E4D7A0E3E994A1FBEB99D21BE65E5673DD54E1EBEB97CC9BD9238C53CD3E3453DC302633EB129A93DEF2FB7BC60330D3E5AD9DA3B533F223D400BFBBDAD78A8BD454F62BE33120CBED145D43C250529BDE4CE4EBE74FFDB3DE05813BE0EDAC4BD659C343E08A411BD1405B2BDCD388B3D736E193E517218BE3243F6BCCA5569BD80340EBE3072B4BD616438BED19702BD2C2AB53D692C35BC15B1203D4576FABD2119CFBDBB05A93D074943BEC9732E3D349F62BEC933103E4948D83C68C034BCD8D1503EA7BB51BE2EFA283E97A090BD933AD83D7D7C413E34CBC73D18B3043E827CF4BDDB70183EAAD3623C2B7B84BCC6B6123E29209A3D4233E33CD45C3C3EF5BBD73DDE725FBE0A72B03DE7650E3EBEB64B3ECA19393EB41EF5BD3A5E9C3D6334E93CB831DABD4769D73D39CF873C7F4D41BDED6F4D3E9A1808BECCB48FBCB3CA01BE756B01BD32BBFA3D1BCB113E1042FDBDF3C1553DEC8AD93D41C70D3EE0263C3E8948043EF16A573D3565B63DE5F18EBD3359BBBB4E295EBE46E602BC15896C3D271EAD3DCF144D3E84B3D6BDB506BF3DEAA162BD81F31ABE9DD61DBDBFDD4EBD830E3B3CD04F3A3E392C32BEAC4E223D5DE08BBD19E6E83DE54F6BBDCC3D73BD0A69993DECBE843CD0CBC33D"> : tensor<10x20xf32>}, %arg5: memref<10xf32, "cuda"> {byre.argname = "Weight5", byre.argtype = 4 : i32, byre.weight_value = dense<[0.0670170113, 0.0825609341, -0.125343189, -0.0073415176, -0.100303039, -0.214000896, 0.114002995, 0.21737574, 0.166609675, -0.119800359]> : tensor<10xf32>}, %arg6: memref<2x10xf32, "cuda"> {byre.argname = "Input0", byre.argtype = 1 : i32}, %arg7: memref<2x10xf32, "cuda"> {byre.argname = "Output0", byre.argtype = 2 : i32}) attributes {byre.entry_point, device_file_name = "your_file"} { + %alloc = memref.alloc() : memref<512xi8, "cuda"> + %0 = "byre.alias"(%alloc) <{offset = 0 : i64}> {device = "cuda"} : (memref<512xi8, "cuda">) -> memref<2x20xf32, "cuda"> + byre.compute @MatmulOp_f32f32_f32(%arg6, %arg0, %0) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<2x10xf32, "cuda">, memref<20x10xf32, "cuda">, memref<2x20xf32, "cuda"> + %1 = "byre.alias"(%alloc) <{offset = 256 : i64}> {device = "cuda"} : (memref<512xi8, "cuda">) -> memref<2x20xf32, "cuda"> + byre.compute @PTXOp(%arg1, %0, %1) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown0", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<20xf32, "cuda">, memref<2x20xf32, "cuda">, memref<2x20xf32, "cuda"> + byre.compute @MatmulOp_f32f32_f32(%1, %arg2, %0) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<2x20xf32, "cuda">, memref<20x20xf32, "cuda">, memref<2x20xf32, "cuda"> + byre.compute @PTXOp(%arg3, %0, %1) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = 
[1 : i32, 2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown0", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<20xf32, "cuda">, memref<2x20xf32, "cuda">, memref<2x20xf32, "cuda"> + %2 = "byre.alias"(%alloc) <{offset = 0 : i64}> {device = "cuda"} : (memref<512xi8, "cuda">) -> memref<2x10xf32, "cuda"> + byre.compute @MatmulOp_f32f32_f32(%1, %arg4, %2) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<2x20xf32, "cuda">, memref<10x20xf32, "cuda">, memref<2x10xf32, "cuda"> + byre.compute @PTXOp(%arg5, %2, %arg7) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown2", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<10xf32, "cuda">, memref<2x10xf32, "cuda">, memref<2x10xf32, "cuda"> return } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/BW/10b_ptx_codegen.mlir b/compiler/test/E2E/ResNet18/BW/10b_ptx_codegen.mlir index fe79d2c0d..31ce4b352 100644 --- a/compiler/test/E2E/ResNet18/BW/10b_ptx_codegen.mlir +++ b/compiler/test/E2E/ResNet18/BW/10b_ptx_codegen.mlir @@ -4,7 +4,7 @@ module attributes {byre.container_module, gpu.container_module} { gpu.module @unified { - llvm.func @Unknown99(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.func @Unknown96(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -22,207 +22,56 @@ module attributes {byre.container_module, gpu.container_module} { %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(2359296 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(512 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 
- %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(4608 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %70 = llvm.load %69 : !llvm.ptr -> f16 - %71 = llvm.fpext %70 : f16 to f32 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %71, %72 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown98(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - 
%16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(2359296 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(512 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(4608 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %70 = llvm.load %69 : !llvm.ptr -> f16 - %71 = llvm.fpext %70 : f16 to f32 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %71, %72 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown97(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : 
!llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(131072 : index) : i64 - %19 = llvm.mlir.constant(256 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = nvvm.read.ptx.sreg.ctaid.x : i32 + %17 = llvm.mlir.constant(131072 : index) : i64 + %18 = llvm.mlir.constant(0 : index) : i64 + %19 = nvvm.read.ptx.sreg.ctaid.x : i32 + %20 = llvm.sext %19 : i32 to i64 + %21 = nvvm.read.ptx.sreg.ntid.x : i32 %22 = llvm.sext %21 : i32 to i64 - %23 = nvvm.read.ptx.sreg.ntid.x : i32 + %23 = nvvm.read.ptx.sreg.tid.x : i32 %24 = llvm.sext %23 : i32 to i64 - %25 = nvvm.read.ptx.sreg.tid.x : i32 - %26 = llvm.sext %25 : i32 to i64 - %27 = llvm.mul %24, %22 : i64 - %28 = llvm.add %26, %27 : i64 - %29 = llvm.icmp "slt" %28, %18 : i64 - llvm.cond_br %29, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %30 = llvm.srem %28, %19 : i64 + %25 = llvm.mul %22, %20 : i64 + %26 = llvm.add %24, %25 : i64 + %27 = nvvm.read.ptx.sreg.nctaid.x : i32 + %28 = llvm.sext %27 : i32 to i64 + %29 = llvm.mul %22, %28 : i64 + llvm.br ^bb1(%26 : i64) + ^bb1(%30: i64): // 2 preds: ^bb0, ^bb2 %31 = llvm.icmp "slt" %30, %17 : i64 - %32 = llvm.add %30, %19 : i64 - %33 = llvm.select %31, %32, %30 : i1, i64 - %34 = llvm.icmp "slt" %28, %17 : i64 - %35 = llvm.sub %20, %28 : i64 - %36 = llvm.select %34, %35, %28 : i1, i64 - %37 = llvm.sdiv %36, %19 : i64 - %38 = llvm.sub %20, %37 : i64 - %39 = llvm.select %34, %38, %37 : i1, i64 - %40 = llvm.mul %39, %19 : i64 - %41 = llvm.add %40, %33 : i64 - %42 = llvm.add %41, %17 : i64 - %43 = llvm.add %42, %17 : i64 - %44 = llvm.getelementptr %arg1[%43] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %45 = llvm.load %44 : !llvm.ptr -> f16 - %46 = llvm.fpext %45 : f16 to f32 - %47 = llvm.getelementptr %arg12[%43] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %46, %47 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 + llvm.cond_br %31, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %32 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %33 = llvm.insertvalue %30, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %34 = llvm.mlir.constant(1 : index) : i64 + %35 = llvm.insertvalue %34, %33[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %36 = llvm.mlir.constant(256 : index) : i64 + %37 = llvm.insertvalue %36, %35[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %34, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.insertvalue %34, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, 
array<4 x i64>, array<4 x i64>)> + %40 = llvm.insertvalue %34, %39[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.getelementptr %arg1[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %42 = llvm.mul %18, %36 : i64 + %43 = llvm.add %42, %18 : i64 + %44 = llvm.add %43, %18 : i64 + %45 = llvm.add %44, %18 : i64 + %46 = llvm.getelementptr %41[%45] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %47 = llvm.load %46 : !llvm.ptr -> f16 + %48 = llvm.fpext %47 : f16 to f32 + %49 = llvm.insertvalue %30, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %50 = llvm.insertvalue %34, %49[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %51 = llvm.insertvalue %36, %50[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %52 = llvm.insertvalue %34, %51[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %53 = llvm.insertvalue %34, %52[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %34, %53[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.getelementptr %arg12[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %56 = llvm.getelementptr %55[%45] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %48, %56 : f32, !llvm.ptr + %57 = llvm.add %30, %29 : i64 + llvm.br ^bb1(%57 : i64) + ^bb3: // pred: ^bb1 llvm.return } - llvm.func @Unknown96(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.func @Unknown95(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -242,68 +91,58 @@ module attributes {byre.container_module, gpu.container_module} { %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %17 = llvm.mlir.constant(0 : index) : i64 %18 = llvm.mlir.constant(2359296 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(512 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 
- %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(4608 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %70 = llvm.load %69 : !llvm.ptr -> f16 - %71 = llvm.fpext %70 : f16 to f32 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %71, %72 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 + %19 = nvvm.read.ptx.sreg.ctaid.x : i32 + %20 = llvm.sext %19 : i32 to i64 + %21 = nvvm.read.ptx.sreg.ntid.x : i32 + %22 = llvm.sext %21 : i32 to i64 + %23 = nvvm.read.ptx.sreg.tid.x : i32 + %24 = llvm.sext %23 : i32 to i64 + %25 = llvm.mul %22, %20 : i64 + %26 = llvm.add %24, %25 : i64 + %27 = nvvm.read.ptx.sreg.nctaid.x : i32 + %28 = llvm.sext %27 : i32 to i64 + %29 = llvm.mul %22, %28 : i64 + llvm.br ^bb1(%26 : i64) + ^bb1(%30: i64): // 2 preds: ^bb0, ^bb2 + %31 = llvm.icmp "slt" %30, %18 : i64 + llvm.cond_br %31, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %32 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %33 = llvm.insertvalue %30, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %34 = llvm.mlir.constant(1 : index) : i64 + %35 = llvm.insertvalue %34, %33[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %36 = llvm.mlir.constant(4608 : index) : i64 + %37 = llvm.insertvalue %36, %35[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %34, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.mlir.constant(9 : index) : i64 + %40 = llvm.insertvalue %39, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.insertvalue %34, %40[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %42 = llvm.mlir.constant(3 : index) : i64 + %43 = llvm.getelementptr %arg1[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %44 = llvm.mul %17, %36 : i64 + %45 = llvm.mul %17, %39 : i64 + %46 = llvm.add %44, %45 : i64 + %47 = llvm.mul %17, %42 : i64 + %48 = llvm.add %46, %47 : i64 + %49 = llvm.add %48, %17 : i64 + %50 = llvm.getelementptr %43[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %51 = llvm.load %50 : !llvm.ptr -> f16 + %52 = llvm.fpext %51 : f16 to f32 + %53 = llvm.insertvalue %30, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %34, %53[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.insertvalue %36, %54[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+ %56 = llvm.insertvalue %34, %55[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %39, %56[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.insertvalue %34, %57[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %59 = llvm.getelementptr %arg12[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %60 = llvm.getelementptr %59[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %52, %60 : f32, !llvm.ptr + %61 = llvm.add %30, %29 : i64 + llvm.br ^bb1(%61 : i64) + ^bb3: // pred: ^bb1 llvm.return } - llvm.func @Unknown95(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.func @Unknown94(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -323,230 +162,58 @@ module attributes {byre.container_module, gpu.container_module} { %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %17 = llvm.mlir.constant(0 : index) : i64 %18 = llvm.mlir.constant(1179648 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(256 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select 
%55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(2304 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %70 = llvm.load %69 : !llvm.ptr -> f16 - %71 = llvm.fpext %70 : f16 to f32 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %71, %72 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown94(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(589824 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(256 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - 
%36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(2304 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %70 = llvm.load %69 : !llvm.ptr -> f16 - %71 = llvm.fpext %70 : f16 to f32 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %71, %72 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown93(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - 
%16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(589824 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(256 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(2304 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %70 = llvm.load %69 : !llvm.ptr -> f16 - %71 = llvm.fpext %70 : f16 to f32 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %71, %72 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 + %19 = nvvm.read.ptx.sreg.ctaid.x : i32 + %20 = llvm.sext %19 : i32 to i64 + %21 = nvvm.read.ptx.sreg.ntid.x : i32 + %22 = llvm.sext %21 : i32 to i64 + %23 = nvvm.read.ptx.sreg.tid.x : i32 + %24 = llvm.sext %23 : i32 to i64 + %25 = llvm.mul %22, %20 : i64 + %26 = llvm.add %24, %25 : i64 + %27 = nvvm.read.ptx.sreg.nctaid.x : i32 + %28 = llvm.sext %27 : i32 to i64 + %29 = llvm.mul %22, %28 : i64 + llvm.br ^bb1(%26 : i64) + ^bb1(%30: i64): // 2 preds: ^bb0, ^bb2 + %31 = llvm.icmp "slt" %30, %18 : i64 + llvm.cond_br %31, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %32 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %33 = llvm.insertvalue %30, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %34 = llvm.mlir.constant(1 : index) : i64 + %35 = llvm.insertvalue %34, %33[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %36 = llvm.mlir.constant(2304 : index) : i64 + %37 = llvm.insertvalue %36, %35[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %34, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = 
llvm.mlir.constant(9 : index) : i64 + %40 = llvm.insertvalue %39, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.insertvalue %34, %40[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %42 = llvm.mlir.constant(3 : index) : i64 + %43 = llvm.getelementptr %arg1[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %44 = llvm.mul %17, %36 : i64 + %45 = llvm.mul %17, %39 : i64 + %46 = llvm.add %44, %45 : i64 + %47 = llvm.mul %17, %42 : i64 + %48 = llvm.add %46, %47 : i64 + %49 = llvm.add %48, %17 : i64 + %50 = llvm.getelementptr %43[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %51 = llvm.load %50 : !llvm.ptr -> f16 + %52 = llvm.fpext %51 : f16 to f32 + %53 = llvm.insertvalue %30, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %34, %53[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.insertvalue %36, %54[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %56 = llvm.insertvalue %34, %55[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %39, %56[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.insertvalue %34, %57[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %59 = llvm.getelementptr %arg12[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %60 = llvm.getelementptr %59[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %52, %60 : f32, !llvm.ptr + %61 = llvm.add %30, %29 : i64 + llvm.br ^bb1(%61 : i64) + ^bb3: // pred: ^bb1 llvm.return } - llvm.func @Unknown92(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.func @Unknown91(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -564,45 +231,56 @@ module attributes {byre.container_module, gpu.container_module} { %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(32768 : index) : i64 - %19 = llvm.mlir.constant(128 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = nvvm.read.ptx.sreg.ctaid.x : i32 + %17 = llvm.mlir.constant(32768 : index) : i64 + %18 = llvm.mlir.constant(0 : index) : i64 + %19 = nvvm.read.ptx.sreg.ctaid.x : i32 + %20 = llvm.sext %19 : i32 to i64 + %21 = nvvm.read.ptx.sreg.ntid.x : i32 %22 = llvm.sext %21 : i32 to i64 - %23 = 
nvvm.read.ptx.sreg.ntid.x : i32 + %23 = nvvm.read.ptx.sreg.tid.x : i32 %24 = llvm.sext %23 : i32 to i64 - %25 = nvvm.read.ptx.sreg.tid.x : i32 - %26 = llvm.sext %25 : i32 to i64 - %27 = llvm.mul %24, %22 : i64 - %28 = llvm.add %26, %27 : i64 - %29 = llvm.icmp "slt" %28, %18 : i64 - llvm.cond_br %29, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %30 = llvm.srem %28, %19 : i64 + %25 = llvm.mul %22, %20 : i64 + %26 = llvm.add %24, %25 : i64 + %27 = nvvm.read.ptx.sreg.nctaid.x : i32 + %28 = llvm.sext %27 : i32 to i64 + %29 = llvm.mul %22, %28 : i64 + llvm.br ^bb1(%26 : i64) + ^bb1(%30: i64): // 2 preds: ^bb0, ^bb2 %31 = llvm.icmp "slt" %30, %17 : i64 - %32 = llvm.add %30, %19 : i64 - %33 = llvm.select %31, %32, %30 : i1, i64 - %34 = llvm.icmp "slt" %28, %17 : i64 - %35 = llvm.sub %20, %28 : i64 - %36 = llvm.select %34, %35, %28 : i1, i64 - %37 = llvm.sdiv %36, %19 : i64 - %38 = llvm.sub %20, %37 : i64 - %39 = llvm.select %34, %38, %37 : i1, i64 - %40 = llvm.mul %39, %19 : i64 - %41 = llvm.add %40, %33 : i64 - %42 = llvm.add %41, %17 : i64 - %43 = llvm.add %42, %17 : i64 - %44 = llvm.getelementptr %arg1[%43] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %45 = llvm.load %44 : !llvm.ptr -> f16 - %46 = llvm.fpext %45 : f16 to f32 - %47 = llvm.getelementptr %arg12[%43] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %46, %47 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 + llvm.cond_br %31, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %32 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %33 = llvm.insertvalue %30, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %34 = llvm.mlir.constant(1 : index) : i64 + %35 = llvm.insertvalue %34, %33[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %36 = llvm.mlir.constant(128 : index) : i64 + %37 = llvm.insertvalue %36, %35[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %34, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.insertvalue %34, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %40 = llvm.insertvalue %34, %39[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.getelementptr %arg1[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %42 = llvm.mul %18, %36 : i64 + %43 = llvm.add %42, %18 : i64 + %44 = llvm.add %43, %18 : i64 + %45 = llvm.add %44, %18 : i64 + %46 = llvm.getelementptr %41[%45] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %47 = llvm.load %46 : !llvm.ptr -> f16 + %48 = llvm.fpext %47 : f16 to f32 + %49 = llvm.insertvalue %30, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %50 = llvm.insertvalue %34, %49[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %51 = llvm.insertvalue %36, %50[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %52 = llvm.insertvalue %34, %51[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %53 = llvm.insertvalue %34, %52[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %34, %53[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.getelementptr %arg12[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %56 = llvm.getelementptr %55[%45] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %48, %56 : f32, !llvm.ptr + %57 = llvm.add %30, %29 : i64 + llvm.br ^bb1(%57 : i64) + ^bb3: // pred: ^bb1 llvm.return } - llvm.func @Unknown91(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, 
%arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.func @Unknown90(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -622,68 +300,58 @@ module attributes {byre.container_module, gpu.container_module} { %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %17 = llvm.mlir.constant(0 : index) : i64 %18 = llvm.mlir.constant(589824 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(256 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(2304 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %70 = llvm.load %69 : !llvm.ptr -> f16 - %71 = llvm.fpext %70 : f16 to f32 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %71, %72 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 + %19 = nvvm.read.ptx.sreg.ctaid.x : i32 + %20 = llvm.sext %19 : i32 to 
i64 + %21 = nvvm.read.ptx.sreg.ntid.x : i32 + %22 = llvm.sext %21 : i32 to i64 + %23 = nvvm.read.ptx.sreg.tid.x : i32 + %24 = llvm.sext %23 : i32 to i64 + %25 = llvm.mul %22, %20 : i64 + %26 = llvm.add %24, %25 : i64 + %27 = nvvm.read.ptx.sreg.nctaid.x : i32 + %28 = llvm.sext %27 : i32 to i64 + %29 = llvm.mul %22, %28 : i64 + llvm.br ^bb1(%26 : i64) + ^bb1(%30: i64): // 2 preds: ^bb0, ^bb2 + %31 = llvm.icmp "slt" %30, %18 : i64 + llvm.cond_br %31, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %32 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %33 = llvm.insertvalue %30, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %34 = llvm.mlir.constant(1 : index) : i64 + %35 = llvm.insertvalue %34, %33[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %36 = llvm.mlir.constant(2304 : index) : i64 + %37 = llvm.insertvalue %36, %35[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %34, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.mlir.constant(9 : index) : i64 + %40 = llvm.insertvalue %39, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.insertvalue %34, %40[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %42 = llvm.mlir.constant(3 : index) : i64 + %43 = llvm.getelementptr %arg1[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %44 = llvm.mul %17, %36 : i64 + %45 = llvm.mul %17, %39 : i64 + %46 = llvm.add %44, %45 : i64 + %47 = llvm.mul %17, %42 : i64 + %48 = llvm.add %46, %47 : i64 + %49 = llvm.add %48, %17 : i64 + %50 = llvm.getelementptr %43[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %51 = llvm.load %50 : !llvm.ptr -> f16 + %52 = llvm.fpext %51 : f16 to f32 + %53 = llvm.insertvalue %30, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %34, %53[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.insertvalue %36, %54[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %56 = llvm.insertvalue %34, %55[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %39, %56[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.insertvalue %34, %57[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %59 = llvm.getelementptr %arg12[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %60 = llvm.getelementptr %59[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %52, %60 : f32, !llvm.ptr + %61 = llvm.add %30, %29 : i64 + llvm.br ^bb1(%61 : i64) + ^bb3: // pred: ^bb1 llvm.return } - llvm.func @Unknown90(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.func @Unknown89(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue 
%arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -703,691 +371,58 @@ module attributes {byre.container_module, gpu.container_module} { %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %17 = llvm.mlir.constant(0 : index) : i64 %18 = llvm.mlir.constant(294912 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(128 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(1152 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %70 = llvm.load %69 : !llvm.ptr -> f16 - %71 = llvm.fpext %70 : f16 to f32 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %71, %72 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown89(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x 
i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(147456 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(128 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(1152 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %70 = llvm.load %69 : !llvm.ptr -> f16 - %71 = llvm.fpext %70 : f16 to f32 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %71, %72 : f32, !llvm.ptr - llvm.br 
^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown88(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(147456 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(128 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : 
i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(1152 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %70 = llvm.load %69 : !llvm.ptr -> f16 - %71 = llvm.fpext %70 : f16 to f32 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %71, %72 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown87(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(8192 : index) : i64 - %19 = llvm.mlir.constant(64 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = nvvm.read.ptx.sreg.ctaid.x : i32 + %19 = nvvm.read.ptx.sreg.ctaid.x : i32 + %20 = llvm.sext %19 : i32 to i64 + %21 = nvvm.read.ptx.sreg.ntid.x : i32 %22 = llvm.sext %21 : i32 to i64 - %23 = nvvm.read.ptx.sreg.ntid.x : i32 + %23 = nvvm.read.ptx.sreg.tid.x : i32 %24 = llvm.sext %23 : i32 to 
i64 - %25 = nvvm.read.ptx.sreg.tid.x : i32 - %26 = llvm.sext %25 : i32 to i64 - %27 = llvm.mul %24, %22 : i64 - %28 = llvm.add %26, %27 : i64 - %29 = llvm.icmp "slt" %28, %18 : i64 - llvm.cond_br %29, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %30 = llvm.srem %28, %19 : i64 - %31 = llvm.icmp "slt" %30, %17 : i64 - %32 = llvm.add %30, %19 : i64 - %33 = llvm.select %31, %32, %30 : i1, i64 - %34 = llvm.icmp "slt" %28, %17 : i64 - %35 = llvm.sub %20, %28 : i64 - %36 = llvm.select %34, %35, %28 : i1, i64 - %37 = llvm.sdiv %36, %19 : i64 - %38 = llvm.sub %20, %37 : i64 - %39 = llvm.select %34, %38, %37 : i1, i64 - %40 = llvm.mul %39, %19 : i64 - %41 = llvm.add %40, %33 : i64 - %42 = llvm.add %41, %17 : i64 - %43 = llvm.add %42, %17 : i64 - %44 = llvm.getelementptr %arg1[%43] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %45 = llvm.load %44 : !llvm.ptr -> f16 - %46 = llvm.fpext %45 : f16 to f32 - %47 = llvm.getelementptr %arg12[%43] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %46, %47 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown86(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(147456 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(128 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 
to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(1152 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %70 = llvm.load %69 : !llvm.ptr -> f16 - %71 = llvm.fpext %70 : f16 to f32 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %71, %72 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown85(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 
= llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(73728 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(64 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(576 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %70 = llvm.load %69 : !llvm.ptr -> f16 - %71 = llvm.fpext %70 : f16 to f32 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %71, %72 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown84(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : 
!llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(36864 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(64 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(576 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %70 = llvm.load %69 : !llvm.ptr -> f16 - %71 = 
llvm.fpext %70 : f16 to f32 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %71, %72 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown83(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(36864 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(64 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 
: i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(576 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %70 = llvm.load %69 : !llvm.ptr -> f16 - %71 = llvm.fpext %70 : f16 to f32 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %71, %72 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown82(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(36864 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(64 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = 
nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(576 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %70 = llvm.load %69 : !llvm.ptr -> f16 - %71 = llvm.fpext %70 : f16 to f32 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %71, %72 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 + %25 = llvm.mul %22, %20 : i64 + %26 = llvm.add %24, %25 : i64 + %27 = nvvm.read.ptx.sreg.nctaid.x : i32 + %28 = llvm.sext %27 : i32 to i64 + %29 = llvm.mul %22, %28 : i64 + llvm.br ^bb1(%26 : i64) + ^bb1(%30: i64): // 2 preds: ^bb0, ^bb2 + %31 = llvm.icmp "slt" %30, %18 : i64 + llvm.cond_br %31, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %32 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %33 = llvm.insertvalue %30, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %34 = llvm.mlir.constant(1 : index) : i64 + %35 = llvm.insertvalue %34, %33[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %36 = llvm.mlir.constant(1152 : index) : i64 + %37 = llvm.insertvalue %36, %35[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %34, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.mlir.constant(9 : index) : i64 + %40 = llvm.insertvalue %39, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.insertvalue %34, %40[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %42 = llvm.mlir.constant(3 : index) : i64 + %43 = llvm.getelementptr %arg1[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %44 = llvm.mul %17, %36 : i64 + %45 = llvm.mul %17, %39 : i64 + %46 = llvm.add %44, %45 : i64 + %47 = llvm.mul %17, %42 : i64 + %48 = llvm.add %46, %47 : i64 + %49 = llvm.add %48, %17 : i64 + %50 = llvm.getelementptr %43[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %51 = llvm.load 
%50 : !llvm.ptr -> f16 + %52 = llvm.fpext %51 : f16 to f32 + %53 = llvm.insertvalue %30, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %34, %53[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.insertvalue %36, %54[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %56 = llvm.insertvalue %34, %55[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %39, %56[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.insertvalue %34, %57[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %59 = llvm.getelementptr %arg12[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %60 = llvm.getelementptr %59[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %52, %60 : f32, !llvm.ptr + %61 = llvm.add %30, %29 : i64 + llvm.br ^bb1(%61 : i64) + ^bb3: // pred: ^bb1 llvm.return } - llvm.func @Unknown81(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.func @Unknown86(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -1402,422 +437,59 @@ module attributes {byre.container_module, gpu.container_module} { %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(36864 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(64 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = 
llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(576 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %70 = llvm.load %69 : !llvm.ptr -> f16 - %71 = llvm.fpext %70 : f16 to f32 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %71, %72 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown80(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: !llvm.ptr, %arg8: !llvm.ptr, %arg9: i64, %arg10: i64, %arg11: i64, %arg12: i64, %arg13: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %5 = llvm.insertvalue %arg7, %0[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %6 = llvm.insertvalue %arg8, %5[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %7 = llvm.insertvalue %arg9, %6[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %8 = llvm.insertvalue %arg10, %7[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %9 = llvm.mlir.constant(0 : index) : i64 - %10 = llvm.mlir.constant(512000 : index) : i64 - %11 = llvm.mlir.constant(512 : index) : i64 - %12 = llvm.mlir.constant(-1 : index) : i64 - %13 = nvvm.read.ptx.sreg.ctaid.x : i32 - %14 = llvm.sext %13 : i32 to i64 - %15 = nvvm.read.ptx.sreg.ntid.x : i32 - %16 = llvm.sext %15 : i32 to i64 - %17 = nvvm.read.ptx.sreg.tid.x : i32 - %18 = llvm.sext %17 : i32 to i64 - %19 = llvm.mul %16, %14 : i64 - %20 = llvm.add %18, %19 : i64 - %21 = llvm.icmp "slt" %20, %10 : i64 - llvm.cond_br %21, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %22 = llvm.srem %20, %11 : i64 - %23 = llvm.icmp "slt" %22, %9 : i64 - %24 = llvm.add %22, %11 : i64 - %25 = llvm.select %23, %24, %22 : i1, i64 - %26 = llvm.icmp "slt" %20, %9 : i64 - %27 = llvm.sub %12, %20 : i64 - %28 = llvm.select %26, %27, %20 : i1, i64 - %29 = llvm.sdiv %28, %11 : i64 - 
%30 = llvm.sub %12, %29 : i64 - %31 = llvm.select %26, %30, %29 : i1, i64 - %32 = llvm.mul %31, %11 : i64 - %33 = llvm.add %32, %25 : i64 - %34 = llvm.getelementptr %arg1[%33] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %35 = llvm.load %34 : !llvm.ptr -> f16 - %36 = llvm.fpext %35 : f16 to f32 - %37 = llvm.getelementptr %arg8[%33] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %36, %37 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown79(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.mlir.constant(1000 : index) : i64 - %6 = nvvm.read.ptx.sreg.ctaid.x : i32 - %7 = llvm.sext %6 : i32 to i64 - %8 = nvvm.read.ptx.sreg.ntid.x : i32 - %9 = llvm.sext %8 : i32 to i64 - %10 = nvvm.read.ptx.sreg.tid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = llvm.mul %9, %7 : i64 - %13 = llvm.add %11, %12 : i64 - %14 = llvm.icmp "slt" %13, %5 : i64 - llvm.cond_br %14, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %15 = llvm.getelementptr %arg1[%13] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %16 = llvm.load %15 : !llvm.ptr -> f32 - %17 = llvm.fptrunc %16 : f32 to f16 - %18 = llvm.fpext %17 : f16 to f32 - %19 = llvm.getelementptr %arg6[%13] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %18, %19 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown78(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: !llvm.ptr, %arg8: !llvm.ptr, %arg9: i64, %arg10: i64, %arg11: i64, %arg12: i64, %arg13: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %5 = llvm.insertvalue %arg7, %0[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %6 = llvm.insertvalue %arg8, %5[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %7 = llvm.insertvalue %arg9, %6[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %8 = llvm.insertvalue %arg10, %7[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %9 = llvm.mlir.constant(0 : index) : i64 - %10 = llvm.mlir.constant(1000 : index) : i64 - %11 = nvvm.read.ptx.sreg.ctaid.x : i32 - %12 = llvm.sext %11 : i32 to i64 - %13 = nvvm.read.ptx.sreg.ntid.x : i32 - %14 = llvm.sext %13 : i32 to i64 - %15 = nvvm.read.ptx.sreg.tid.x : i32 - %16 = llvm.sext %15 : i32 to i64 - %17 = llvm.mul %14, %12 : i64 - %18 = llvm.add %16, %17 : i64 - %19 = llvm.icmp "slt" %18, %10 : i64 - llvm.cond_br %19, ^bb1, 
^bb2 - ^bb1: // pred: ^bb0 - %20 = llvm.mul %9, %10 : i64 - %21 = llvm.add %20, %18 : i64 - %22 = llvm.getelementptr %arg1[%21] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %23 = llvm.load %22 : !llvm.ptr -> f16 - %24 = llvm.fpext %23 : f16 to f32 - %25 = llvm.getelementptr %arg8[%21] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %24, %25 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown77(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(9408 : index) : i64 - %19 = llvm.mlir.constant(7 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(3 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = 
llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(147 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(49 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %70 = llvm.load %69 : !llvm.ptr -> f16 - %71 = llvm.fpext %70 : f16 to f32 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %71, %72 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown74(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, 
array<4 x i64>, array<4 x i64>)> - %17 = llvm.insertvalue %arg22, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %18 = llvm.insertvalue %arg23, %17[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %19 = llvm.insertvalue %arg24, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %20 = llvm.insertvalue %arg25, %19[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %21 = llvm.insertvalue %arg29, %20[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %22 = llvm.insertvalue %arg26, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %23 = llvm.insertvalue %arg30, %22[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %24 = llvm.insertvalue %arg27, %23[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %25 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %26 = llvm.mlir.constant(0 : index) : i64 - %27 = llvm.mlir.constant(802816 : index) : i64 - %28 = llvm.mlir.constant(112 : index) : i64 - %29 = llvm.mlir.constant(-1 : index) : i64 - %30 = nvvm.read.ptx.sreg.ctaid.x : i32 - %31 = llvm.sext %30 : i32 to i64 - %32 = nvvm.read.ptx.sreg.ntid.x : i32 - %33 = llvm.sext %32 : i32 to i64 - %34 = nvvm.read.ptx.sreg.tid.x : i32 - %35 = llvm.sext %34 : i32 to i64 - %36 = llvm.mul %33, %31 : i64 - %37 = llvm.add %35, %36 : i64 - %38 = llvm.icmp "slt" %37, %27 : i64 - llvm.cond_br %38, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %39 = llvm.srem %37, %28 : i64 - %40 = llvm.icmp "slt" %39, %26 : i64 - %41 = llvm.add %39, %28 : i64 - %42 = llvm.select %40, %41, %39 : i1, i64 - %43 = llvm.icmp "slt" %37, %26 : i64 - %44 = llvm.sub %29, %37 : i64 - %45 = llvm.select %43, %44, %37 : i1, i64 - %46 = llvm.sdiv %45, %28 : i64 - %47 = llvm.sub %29, %46 : i64 - %48 = llvm.select %43, %47, %46 : i1, i64 - %49 = llvm.srem %48, %28 : i64 - %50 = llvm.icmp "slt" %49, %26 : i64 - %51 = llvm.add %49, %28 : i64 - %52 = llvm.select %50, %51, %49 : i1, i64 - %53 = llvm.icmp "slt" %48, %26 : i64 - %54 = llvm.sub %29, %48 : i64 - %55 = llvm.select %53, %54, %48 : i1, i64 - %56 = llvm.sdiv %55, %28 : i64 - %57 = llvm.sub %29, %56 : i64 - %58 = llvm.select %53, %57, %56 : i1, i64 - %59 = llvm.mul %26, %27 : i64 - %60 = llvm.mlir.constant(12544 : index) : i64 - %61 = llvm.mul %58, %60 : i64 - %62 = llvm.add %59, %61 : i64 - %63 = llvm.mul %52, %28 : i64 - %64 = llvm.add %62, %63 : i64 - %65 = llvm.add %64, %42 : i64 - %66 = llvm.getelementptr %arg1[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %67 = llvm.load %66 : !llvm.ptr -> f16 - %68 = llvm.getelementptr %arg12[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %69 = llvm.load %68 : !llvm.ptr -> f16 - %70 = llvm.fcmp "ogt" %67, %25 : f16 - %71 = llvm.select %70, %69, %25 : i1, f16 - %72 = llvm.getelementptr %arg23[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown73(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 
x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.insertvalue %arg22, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %18 = llvm.insertvalue %arg23, %17[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %19 = llvm.insertvalue %arg24, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %20 = llvm.insertvalue %arg25, %19[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %21 = llvm.insertvalue %arg29, %20[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %22 = llvm.insertvalue %arg26, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %23 = llvm.insertvalue %arg30, %22[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %24 = llvm.insertvalue %arg27, %23[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %25 = llvm.mlir.constant(0 : index) : i64 - %26 = llvm.mlir.constant(200704 : index) : i64 - %27 = llvm.mlir.constant(56 : index) : i64 - %28 = llvm.mlir.constant(-1 : index) : i64 - %29 = nvvm.read.ptx.sreg.ctaid.x : i32 - %30 = llvm.sext %29 : i32 to i64 - %31 = nvvm.read.ptx.sreg.ntid.x : i32 - %32 = llvm.sext %31 : i32 to i64 - %33 = nvvm.read.ptx.sreg.tid.x : i32 - %34 = llvm.sext %33 : i32 to i64 - %35 = llvm.mul %32, %30 : i64 - %36 = llvm.add %34, %35 : i64 - %37 = llvm.icmp "slt" %36, %26 : i64 - llvm.cond_br %37, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %38 = llvm.srem %36, %27 : i64 - %39 = llvm.icmp "slt" %38, %25 : i64 - %40 = llvm.add %38, %27 : i64 - %41 = llvm.select %39, %40, %38 : i1, i64 - %42 = llvm.icmp "slt" %36, %25 : i64 - %43 = llvm.sub %28, %36 : i64 - %44 = llvm.select %42, %43, %36 : i1, i64 - %45 = llvm.sdiv %44, %27 : i64 - %46 = llvm.sub %28, %45 : i64 - %47 = llvm.select %42, %46, %45 : i1, i64 - %48 = llvm.srem %47, %27 : i64 - %49 = llvm.icmp "slt" %48, %25 : i64 - %50 = 
llvm.add %48, %27 : i64 - %51 = llvm.select %49, %50, %48 : i1, i64 - %52 = llvm.icmp "slt" %47, %25 : i64 - %53 = llvm.sub %28, %47 : i64 - %54 = llvm.select %52, %53, %47 : i1, i64 - %55 = llvm.sdiv %54, %27 : i64 - %56 = llvm.sub %28, %55 : i64 - %57 = llvm.select %52, %56, %55 : i1, i64 - %58 = llvm.mul %25, %26 : i64 - %59 = llvm.mlir.constant(3136 : index) : i64 - %60 = llvm.mul %57, %59 : i64 - %61 = llvm.add %58, %60 : i64 - %62 = llvm.mul %51, %27 : i64 - %63 = llvm.add %61, %62 : i64 - %64 = llvm.add %63, %41 : i64 - %65 = llvm.getelementptr %arg1[%64] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %66 = llvm.load %65 : !llvm.ptr -> f16 - %67 = llvm.getelementptr %arg12[%64] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %68 = llvm.load %67 : !llvm.ptr -> f16 - %69 = llvm.fadd %66, %68 : f16 - %70 = llvm.getelementptr %arg23[%64] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %69, %70 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 + %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %17 = llvm.mlir.constant(8192 : index) : i64 + %18 = llvm.mlir.constant(0 : index) : i64 + %19 = nvvm.read.ptx.sreg.ctaid.x : i32 + %20 = llvm.sext %19 : i32 to i64 + %21 = nvvm.read.ptx.sreg.ntid.x : i32 + %22 = llvm.sext %21 : i32 to i64 + %23 = nvvm.read.ptx.sreg.tid.x : i32 + %24 = llvm.sext %23 : i32 to i64 + %25 = llvm.mul %22, %20 : i64 + %26 = llvm.add %24, %25 : i64 + %27 = nvvm.read.ptx.sreg.nctaid.x : i32 + %28 = llvm.sext %27 : i32 to i64 + %29 = llvm.mul %22, %28 : i64 + llvm.br ^bb1(%26 : i64) + ^bb1(%30: i64): // 2 preds: ^bb0, ^bb2 + %31 = llvm.icmp "slt" %30, %17 : i64 + llvm.cond_br %31, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %32 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %33 = llvm.insertvalue %30, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %34 = llvm.mlir.constant(1 : index) : i64 + %35 = llvm.insertvalue %34, %33[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %36 = llvm.mlir.constant(64 : index) : i64 + %37 = llvm.insertvalue %36, %35[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %34, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.insertvalue %34, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %40 = llvm.insertvalue %34, %39[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.getelementptr %arg1[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %42 = llvm.mul %18, %36 : i64 + %43 = llvm.add %42, %18 : i64 + %44 = llvm.add %43, %18 : i64 + %45 = llvm.add %44, %18 : i64 + %46 = llvm.getelementptr %41[%45] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %47 = llvm.load %46 : !llvm.ptr -> f16 + %48 = llvm.fpext %47 : f16 to f32 + %49 = llvm.insertvalue %30, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %50 = llvm.insertvalue %34, %49[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %51 = llvm.insertvalue %36, %50[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %52 = llvm.insertvalue %34, %51[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %53 = llvm.insertvalue %34, %52[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x 
i64>)> + %54 = llvm.insertvalue %34, %53[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.getelementptr %arg12[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %56 = llvm.getelementptr %55[%45] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %48, %56 : f32, !llvm.ptr + %57 = llvm.add %30, %29 : i64 + llvm.br ^bb1(%57 : i64) + ^bb3: // pred: ^bb1 llvm.return } - llvm.func @Unknown69(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.func @Unknown85(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -1835,70 +507,60 @@ module attributes {byre.container_module, gpu.container_module} { %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.insertvalue %arg22, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %18 = llvm.insertvalue %arg23, %17[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %19 = llvm.insertvalue %arg24, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %20 = llvm.insertvalue %arg25, %19[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %21 = llvm.insertvalue %arg29, %20[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %22 = llvm.insertvalue %arg26, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %23 = llvm.insertvalue %arg30, %22[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %24 = llvm.insertvalue %arg27, %23[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %25 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %26 = llvm.mlir.constant(0 : index) : i64 - %27 = llvm.mlir.constant(200704 : index) : i64 - %28 = llvm.mlir.constant(56 : index) : i64 - %29 = llvm.mlir.constant(-1 : index) : i64 - %30 = nvvm.read.ptx.sreg.ctaid.x : i32 - %31 = llvm.sext %30 : i32 to i64 - %32 = nvvm.read.ptx.sreg.ntid.x : i32 - %33 = llvm.sext %32 : i32 to i64 - %34 = nvvm.read.ptx.sreg.tid.x : i32 - %35 = llvm.sext %34 : i32 to i64 - %36 = llvm.mul %33, %31 : i64 - %37 = llvm.add %35, %36 : i64 - %38 = llvm.icmp "slt" %37, %27 : i64 - llvm.cond_br %38, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %39 = llvm.srem %37, %28 : i64 - %40 = llvm.icmp "slt" %39, %26 : i64 
- %41 = llvm.add %39, %28 : i64 - %42 = llvm.select %40, %41, %39 : i1, i64 - %43 = llvm.icmp "slt" %37, %26 : i64 - %44 = llvm.sub %29, %37 : i64 - %45 = llvm.select %43, %44, %37 : i1, i64 - %46 = llvm.sdiv %45, %28 : i64 - %47 = llvm.sub %29, %46 : i64 - %48 = llvm.select %43, %47, %46 : i1, i64 - %49 = llvm.srem %48, %28 : i64 - %50 = llvm.icmp "slt" %49, %26 : i64 - %51 = llvm.add %49, %28 : i64 - %52 = llvm.select %50, %51, %49 : i1, i64 - %53 = llvm.icmp "slt" %48, %26 : i64 - %54 = llvm.sub %29, %48 : i64 - %55 = llvm.select %53, %54, %48 : i1, i64 - %56 = llvm.sdiv %55, %28 : i64 - %57 = llvm.sub %29, %56 : i64 - %58 = llvm.select %53, %57, %56 : i1, i64 - %59 = llvm.mul %26, %27 : i64 - %60 = llvm.mlir.constant(3136 : index) : i64 - %61 = llvm.mul %58, %60 : i64 - %62 = llvm.add %59, %61 : i64 - %63 = llvm.mul %52, %28 : i64 - %64 = llvm.add %62, %63 : i64 - %65 = llvm.add %64, %42 : i64 - %66 = llvm.getelementptr %arg1[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %67 = llvm.load %66 : !llvm.ptr -> f16 - %68 = llvm.getelementptr %arg12[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %69 = llvm.load %68 : !llvm.ptr -> f16 - %70 = llvm.fcmp "ogt" %67, %25 : f16 - %71 = llvm.select %70, %69, %25 : i1, f16 - %72 = llvm.getelementptr %arg23[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 + %17 = llvm.mlir.constant(0 : index) : i64 + %18 = llvm.mlir.constant(147456 : index) : i64 + %19 = nvvm.read.ptx.sreg.ctaid.x : i32 + %20 = llvm.sext %19 : i32 to i64 + %21 = nvvm.read.ptx.sreg.ntid.x : i32 + %22 = llvm.sext %21 : i32 to i64 + %23 = nvvm.read.ptx.sreg.tid.x : i32 + %24 = llvm.sext %23 : i32 to i64 + %25 = llvm.mul %22, %20 : i64 + %26 = llvm.add %24, %25 : i64 + %27 = nvvm.read.ptx.sreg.nctaid.x : i32 + %28 = llvm.sext %27 : i32 to i64 + %29 = llvm.mul %22, %28 : i64 + llvm.br ^bb1(%26 : i64) + ^bb1(%30: i64): // 2 preds: ^bb0, ^bb2 + %31 = llvm.icmp "slt" %30, %18 : i64 + llvm.cond_br %31, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %32 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %33 = llvm.insertvalue %30, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %34 = llvm.mlir.constant(1 : index) : i64 + %35 = llvm.insertvalue %34, %33[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %36 = llvm.mlir.constant(1152 : index) : i64 + %37 = llvm.insertvalue %36, %35[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %34, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.mlir.constant(9 : index) : i64 + %40 = llvm.insertvalue %39, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.insertvalue %34, %40[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %42 = llvm.mlir.constant(3 : index) : i64 + %43 = llvm.getelementptr %arg1[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %44 = llvm.mul %17, %36 : i64 + %45 = llvm.mul %17, %39 : i64 + %46 = llvm.add %44, %45 : i64 + %47 = llvm.mul %17, %42 : i64 + %48 = llvm.add %46, %47 : i64 + %49 = llvm.add %48, %17 : i64 + %50 = llvm.getelementptr %43[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %51 = llvm.load %50 : !llvm.ptr -> f16 + %52 = llvm.fpext %51 : f16 to f32 + %53 = llvm.insertvalue %30, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %34, %53[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.insertvalue %36, %54[4, 0] 
: !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %56 = llvm.insertvalue %34, %55[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %39, %56[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.insertvalue %34, %57[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %59 = llvm.getelementptr %arg12[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %60 = llvm.getelementptr %59[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %52, %60 : f32, !llvm.ptr + %61 = llvm.add %30, %29 : i64 + llvm.br ^bb1(%61 : i64) + ^bb3: // pred: ^bb1 llvm.return } - llvm.func @Unknown65(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64, %arg33: !llvm.ptr, %arg34: !llvm.ptr, %arg35: i64, %arg36: i64, %arg37: i64, %arg38: i64, %arg39: i64, %arg40: i64, %arg41: i64, %arg42: i64, %arg43: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.func @Unknown84(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -1916,81 +578,60 @@ module attributes {byre.container_module, gpu.container_module} { %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.insertvalue %arg22, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %18 = llvm.insertvalue %arg23, %17[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %19 = llvm.insertvalue %arg24, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %20 = llvm.insertvalue %arg25, %19[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %21 = llvm.insertvalue %arg29, %20[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %22 = llvm.insertvalue %arg26, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %23 = llvm.insertvalue %arg30, %22[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %24 = llvm.insertvalue %arg27, %23[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %25 = llvm.insertvalue %arg33, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %26 = llvm.insertvalue %arg34, %25[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %27 = llvm.insertvalue %arg35, %26[2] : !llvm.struct<(ptr, 
ptr, i64, array<4 x i64>, array<4 x i64>)> - %28 = llvm.insertvalue %arg36, %27[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %29 = llvm.insertvalue %arg40, %28[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %30 = llvm.insertvalue %arg37, %29[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %31 = llvm.insertvalue %arg41, %30[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %32 = llvm.insertvalue %arg38, %31[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %33 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %34 = llvm.mlir.constant(0 : index) : i64 - %35 = llvm.mlir.constant(200704 : index) : i64 - %36 = llvm.mlir.constant(56 : index) : i64 - %37 = llvm.mlir.constant(-1 : index) : i64 - %38 = nvvm.read.ptx.sreg.ctaid.x : i32 - %39 = llvm.sext %38 : i32 to i64 - %40 = nvvm.read.ptx.sreg.ntid.x : i32 - %41 = llvm.sext %40 : i32 to i64 - %42 = nvvm.read.ptx.sreg.tid.x : i32 - %43 = llvm.sext %42 : i32 to i64 - %44 = llvm.mul %41, %39 : i64 - %45 = llvm.add %43, %44 : i64 - %46 = llvm.icmp "slt" %45, %35 : i64 - llvm.cond_br %46, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %47 = llvm.srem %45, %36 : i64 - %48 = llvm.icmp "slt" %47, %34 : i64 - %49 = llvm.add %47, %36 : i64 - %50 = llvm.select %48, %49, %47 : i1, i64 - %51 = llvm.icmp "slt" %45, %34 : i64 - %52 = llvm.sub %37, %45 : i64 - %53 = llvm.select %51, %52, %45 : i1, i64 - %54 = llvm.sdiv %53, %36 : i64 - %55 = llvm.sub %37, %54 : i64 - %56 = llvm.select %51, %55, %54 : i1, i64 - %57 = llvm.srem %56, %36 : i64 - %58 = llvm.icmp "slt" %57, %34 : i64 - %59 = llvm.add %57, %36 : i64 - %60 = llvm.select %58, %59, %57 : i1, i64 - %61 = llvm.icmp "slt" %56, %34 : i64 - %62 = llvm.sub %37, %56 : i64 - %63 = llvm.select %61, %62, %56 : i1, i64 - %64 = llvm.sdiv %63, %36 : i64 - %65 = llvm.sub %37, %64 : i64 - %66 = llvm.select %61, %65, %64 : i1, i64 - %67 = llvm.mul %34, %35 : i64 - %68 = llvm.mlir.constant(3136 : index) : i64 - %69 = llvm.mul %66, %68 : i64 - %70 = llvm.add %67, %69 : i64 - %71 = llvm.mul %60, %36 : i64 - %72 = llvm.add %70, %71 : i64 - %73 = llvm.add %72, %50 : i64 - %74 = llvm.getelementptr %arg23[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %75 = llvm.load %74 : !llvm.ptr -> f16 - %76 = llvm.getelementptr %arg1[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %77 = llvm.load %76 : !llvm.ptr -> f16 - %78 = llvm.getelementptr %arg12[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %79 = llvm.load %78 : !llvm.ptr -> f16 - %80 = llvm.fadd %77, %79 : f16 - %81 = llvm.fcmp "ogt" %75, %33 : f16 - %82 = llvm.select %81, %80, %33 : i1, f16 - %83 = llvm.getelementptr %arg34[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %82, %83 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 + %17 = llvm.mlir.constant(0 : index) : i64 + %18 = llvm.mlir.constant(73728 : index) : i64 + %19 = nvvm.read.ptx.sreg.ctaid.x : i32 + %20 = llvm.sext %19 : i32 to i64 + %21 = nvvm.read.ptx.sreg.ntid.x : i32 + %22 = llvm.sext %21 : i32 to i64 + %23 = nvvm.read.ptx.sreg.tid.x : i32 + %24 = llvm.sext %23 : i32 to i64 + %25 = llvm.mul %22, %20 : i64 + %26 = llvm.add %24, %25 : i64 + %27 = nvvm.read.ptx.sreg.nctaid.x : i32 + %28 = llvm.sext %27 : i32 to i64 + %29 = llvm.mul %22, %28 : i64 + llvm.br ^bb1(%26 : i64) + ^bb1(%30: i64): // 2 preds: ^bb0, ^bb2 + %31 = llvm.icmp "slt" %30, %18 : i64 + llvm.cond_br %31, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %32 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %33 = llvm.insertvalue %30, %2[2] : 
!llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %34 = llvm.mlir.constant(1 : index) : i64 + %35 = llvm.insertvalue %34, %33[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %36 = llvm.mlir.constant(576 : index) : i64 + %37 = llvm.insertvalue %36, %35[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %34, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.mlir.constant(9 : index) : i64 + %40 = llvm.insertvalue %39, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.insertvalue %34, %40[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %42 = llvm.mlir.constant(3 : index) : i64 + %43 = llvm.getelementptr %arg1[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %44 = llvm.mul %17, %36 : i64 + %45 = llvm.mul %17, %39 : i64 + %46 = llvm.add %44, %45 : i64 + %47 = llvm.mul %17, %42 : i64 + %48 = llvm.add %46, %47 : i64 + %49 = llvm.add %48, %17 : i64 + %50 = llvm.getelementptr %43[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %51 = llvm.load %50 : !llvm.ptr -> f16 + %52 = llvm.fpext %51 : f16 to f32 + %53 = llvm.insertvalue %30, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %34, %53[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.insertvalue %36, %54[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %56 = llvm.insertvalue %34, %55[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %39, %56[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.insertvalue %34, %57[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %59 = llvm.getelementptr %arg12[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %60 = llvm.getelementptr %59[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %52, %60 : f32, !llvm.ptr + %61 = llvm.add %30, %29 : i64 + llvm.br ^bb1(%61 : i64) + ^bb3: // pred: ^bb1 llvm.return } - llvm.func @Unknown61(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.func @Unknown80(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -2008,70 +649,159 @@ module attributes {byre.container_module, gpu.container_module} { %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %15 = llvm.insertvalue %arg19, %14[4, 1] : 
!llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.insertvalue %arg22, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %18 = llvm.insertvalue %arg23, %17[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %19 = llvm.insertvalue %arg24, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %20 = llvm.insertvalue %arg25, %19[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %21 = llvm.insertvalue %arg29, %20[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %22 = llvm.insertvalue %arg26, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %23 = llvm.insertvalue %arg30, %22[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %24 = llvm.insertvalue %arg27, %23[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %25 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %26 = llvm.mlir.constant(0 : index) : i64 - %27 = llvm.mlir.constant(200704 : index) : i64 - %28 = llvm.mlir.constant(56 : index) : i64 - %29 = llvm.mlir.constant(-1 : index) : i64 - %30 = nvvm.read.ptx.sreg.ctaid.x : i32 - %31 = llvm.sext %30 : i32 to i64 - %32 = nvvm.read.ptx.sreg.ntid.x : i32 - %33 = llvm.sext %32 : i32 to i64 - %34 = nvvm.read.ptx.sreg.tid.x : i32 - %35 = llvm.sext %34 : i32 to i64 - %36 = llvm.mul %33, %31 : i64 - %37 = llvm.add %35, %36 : i64 - %38 = llvm.icmp "slt" %37, %27 : i64 - llvm.cond_br %38, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %39 = llvm.srem %37, %28 : i64 - %40 = llvm.icmp "slt" %39, %26 : i64 - %41 = llvm.add %39, %28 : i64 - %42 = llvm.select %40, %41, %39 : i1, i64 - %43 = llvm.icmp "slt" %37, %26 : i64 - %44 = llvm.sub %29, %37 : i64 - %45 = llvm.select %43, %44, %37 : i1, i64 - %46 = llvm.sdiv %45, %28 : i64 - %47 = llvm.sub %29, %46 : i64 - %48 = llvm.select %43, %47, %46 : i1, i64 - %49 = llvm.srem %48, %28 : i64 - %50 = llvm.icmp "slt" %49, %26 : i64 - %51 = llvm.add %49, %28 : i64 - %52 = llvm.select %50, %51, %49 : i1, i64 - %53 = llvm.icmp "slt" %48, %26 : i64 - %54 = llvm.sub %29, %48 : i64 - %55 = llvm.select %53, %54, %48 : i1, i64 - %56 = llvm.sdiv %55, %28 : i64 - %57 = llvm.sub %29, %56 : i64 - %58 = llvm.select %53, %57, %56 : i1, i64 - %59 = llvm.mul %26, %27 : i64 - %60 = llvm.mlir.constant(3136 : index) : i64 - %61 = llvm.mul %58, %60 : i64 - %62 = llvm.add %59, %61 : i64 - %63 = llvm.mul %52, %28 : i64 - %64 = llvm.add %62, %63 : i64 - %65 = llvm.add %64, %42 : i64 - %66 = llvm.getelementptr %arg1[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %67 = llvm.load %66 : !llvm.ptr -> f16 - %68 = llvm.getelementptr %arg12[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %69 = llvm.load %68 : !llvm.ptr -> f16 - %70 = llvm.fcmp "ogt" %67, %25 : f16 - %71 = llvm.select %70, %69, %25 : i1, f16 - %72 = llvm.getelementptr %arg23[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 + %17 = llvm.mlir.constant(0 : index) : i64 + %18 = llvm.mlir.constant(36864 : index) : i64 + %19 = nvvm.read.ptx.sreg.ctaid.x : i32 + %20 = llvm.sext %19 : i32 to i64 + %21 = nvvm.read.ptx.sreg.ntid.x : i32 + %22 = llvm.sext %21 : i32 to i64 + %23 = nvvm.read.ptx.sreg.tid.x : i32 + %24 = llvm.sext %23 : i32 to i64 + %25 = llvm.mul %22, %20 : i64 + %26 = llvm.add %24, %25 : i64 + %27 = nvvm.read.ptx.sreg.nctaid.x : i32 + %28 = llvm.sext %27 : i32 to i64 + %29 
= llvm.mul %22, %28 : i64 + llvm.br ^bb1(%26 : i64) + ^bb1(%30: i64): // 2 preds: ^bb0, ^bb2 + %31 = llvm.icmp "slt" %30, %18 : i64 + llvm.cond_br %31, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %32 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %33 = llvm.insertvalue %30, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %34 = llvm.mlir.constant(1 : index) : i64 + %35 = llvm.insertvalue %34, %33[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %36 = llvm.mlir.constant(576 : index) : i64 + %37 = llvm.insertvalue %36, %35[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %34, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.mlir.constant(9 : index) : i64 + %40 = llvm.insertvalue %39, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.insertvalue %34, %40[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %42 = llvm.mlir.constant(3 : index) : i64 + %43 = llvm.getelementptr %arg1[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %44 = llvm.mul %17, %36 : i64 + %45 = llvm.mul %17, %39 : i64 + %46 = llvm.add %44, %45 : i64 + %47 = llvm.mul %17, %42 : i64 + %48 = llvm.add %46, %47 : i64 + %49 = llvm.add %48, %17 : i64 + %50 = llvm.getelementptr %43[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %51 = llvm.load %50 : !llvm.ptr -> f16 + %52 = llvm.fpext %51 : f16 to f32 + %53 = llvm.insertvalue %30, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %34, %53[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.insertvalue %36, %54[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %56 = llvm.insertvalue %34, %55[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %39, %56[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.insertvalue %34, %57[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %59 = llvm.getelementptr %arg12[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %60 = llvm.getelementptr %59[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %52, %60 : f32, !llvm.ptr + %61 = llvm.add %30, %29 : i64 + llvm.br ^bb1(%61 : i64) + ^bb3: // pred: ^bb1 llvm.return } - llvm.func @Unknown57(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64, %arg33: !llvm.ptr, %arg34: !llvm.ptr, %arg35: i64, %arg36: i64, %arg37: i64, %arg38: i64, %arg39: i64, %arg40: i64, %arg41: i64, %arg42: i64, %arg43: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.func @Unknown79(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: !llvm.ptr {llvm.noalias}, %arg8: !llvm.ptr {llvm.noalias}, %arg9: i64, %arg10: i64, %arg11: i64, %arg12: i64, %arg13: i64) attributes {gpu.kernel, nvvm.kernel} { + %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %2 = 
llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %5 = llvm.insertvalue %arg7, %0[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %6 = llvm.insertvalue %arg8, %5[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %7 = llvm.insertvalue %arg9, %6[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %8 = llvm.insertvalue %arg10, %7[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %9 = llvm.mlir.constant(0 : index) : i64 + %10 = llvm.mlir.constant(512000 : index) : i64 + %11 = nvvm.read.ptx.sreg.ctaid.x : i32 + %12 = llvm.sext %11 : i32 to i64 + %13 = nvvm.read.ptx.sreg.ntid.x : i32 + %14 = llvm.sext %13 : i32 to i64 + %15 = nvvm.read.ptx.sreg.tid.x : i32 + %16 = llvm.sext %15 : i32 to i64 + %17 = llvm.mul %14, %12 : i64 + %18 = llvm.add %16, %17 : i64 + %19 = nvvm.read.ptx.sreg.nctaid.x : i32 + %20 = llvm.sext %19 : i32 to i64 + %21 = llvm.mul %14, %20 : i64 + llvm.br ^bb1(%18 : i64) + ^bb1(%22: i64): // 2 preds: ^bb0, ^bb2 + %23 = llvm.icmp "slt" %22, %10 : i64 + llvm.cond_br %23, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %24 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %25 = llvm.insertvalue %22, %2[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %26 = llvm.mlir.constant(1 : index) : i64 + %27 = llvm.insertvalue %26, %25[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %28 = llvm.mlir.constant(512 : index) : i64 + %29 = llvm.getelementptr %arg1[%22] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %30 = llvm.mul %9, %28 : i64 + %31 = llvm.add %30, %9 : i64 + %32 = llvm.getelementptr %29[%31] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %33 = llvm.load %32 : !llvm.ptr -> f16 + %34 = llvm.fpext %33 : f16 to f32 + %35 = llvm.insertvalue %22, %6[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %36 = llvm.insertvalue %26, %35[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %37 = llvm.getelementptr %arg8[%22] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %38 = llvm.getelementptr %37[%31] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %34, %38 : f32, !llvm.ptr + %39 = llvm.add %22, %21 : i64 + llvm.br ^bb1(%39 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown78(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: !llvm.ptr {llvm.noalias}, %arg8: !llvm.ptr {llvm.noalias}, %arg9: i64, %arg10: i64, %arg11: i64, %arg12: i64, %arg13: i64) attributes {gpu.kernel, nvvm.kernel} { + %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %5 = llvm.insertvalue %arg7, %0[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %6 = llvm.insertvalue %arg8, %5[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %7 = llvm.insertvalue %arg9, %6[2] : !llvm.struct<(ptr, 
ptr, i64, array<2 x i64>, array<2 x i64>)> + %8 = llvm.insertvalue %arg10, %7[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %9 = llvm.mlir.constant(0 : index) : i64 + %10 = llvm.mlir.constant(1000 : index) : i64 + %11 = nvvm.read.ptx.sreg.ctaid.x : i32 + %12 = llvm.sext %11 : i32 to i64 + %13 = nvvm.read.ptx.sreg.ntid.x : i32 + %14 = llvm.sext %13 : i32 to i64 + %15 = nvvm.read.ptx.sreg.tid.x : i32 + %16 = llvm.sext %15 : i32 to i64 + %17 = llvm.mul %14, %12 : i64 + %18 = llvm.add %16, %17 : i64 + %19 = nvvm.read.ptx.sreg.nctaid.x : i32 + %20 = llvm.sext %19 : i32 to i64 + %21 = llvm.mul %14, %20 : i64 + llvm.br ^bb1(%18 : i64) + ^bb1(%22: i64): // 2 preds: ^bb0, ^bb2 + %23 = llvm.icmp "slt" %22, %10 : i64 + llvm.cond_br %23, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %24 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %25 = llvm.insertvalue %22, %2[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %26 = llvm.mlir.constant(1 : index) : i64 + %27 = llvm.insertvalue %26, %25[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %28 = llvm.getelementptr %arg1[%22] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %29 = llvm.mul %9, %10 : i64 + %30 = llvm.add %29, %9 : i64 + %31 = llvm.getelementptr %28[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %32 = llvm.load %31 : !llvm.ptr -> f16 + %33 = llvm.fpext %32 : f16 to f32 + %34 = llvm.fptrunc %33 : f32 to f16 + %35 = llvm.fpext %34 : f16 to f32 + %36 = llvm.insertvalue %22, %6[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %37 = llvm.insertvalue %26, %36[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %38 = llvm.getelementptr %arg8[%22] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %39 = llvm.getelementptr %38[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %35, %39 : f32, !llvm.ptr + %40 = llvm.add %22, %21 : i64 + llvm.br ^bb1(%40 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown77(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -2089,81 +819,60 @@ module attributes {byre.container_module, gpu.container_module} { %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.insertvalue %arg22, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %18 = llvm.insertvalue %arg23, %17[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %19 = llvm.insertvalue %arg24, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %20 = llvm.insertvalue %arg25, %19[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %21 = llvm.insertvalue %arg29, %20[4, 0] : !llvm.struct<(ptr, ptr, i64, 
array<4 x i64>, array<4 x i64>)> - %22 = llvm.insertvalue %arg26, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %23 = llvm.insertvalue %arg30, %22[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %24 = llvm.insertvalue %arg27, %23[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %25 = llvm.insertvalue %arg33, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %26 = llvm.insertvalue %arg34, %25[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %27 = llvm.insertvalue %arg35, %26[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %28 = llvm.insertvalue %arg36, %27[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %29 = llvm.insertvalue %arg40, %28[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %30 = llvm.insertvalue %arg37, %29[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %31 = llvm.insertvalue %arg41, %30[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %32 = llvm.insertvalue %arg38, %31[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %33 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %34 = llvm.mlir.constant(0 : index) : i64 - %35 = llvm.mlir.constant(200704 : index) : i64 - %36 = llvm.mlir.constant(56 : index) : i64 - %37 = llvm.mlir.constant(-1 : index) : i64 - %38 = nvvm.read.ptx.sreg.ctaid.x : i32 - %39 = llvm.sext %38 : i32 to i64 - %40 = nvvm.read.ptx.sreg.ntid.x : i32 - %41 = llvm.sext %40 : i32 to i64 - %42 = nvvm.read.ptx.sreg.tid.x : i32 - %43 = llvm.sext %42 : i32 to i64 - %44 = llvm.mul %41, %39 : i64 - %45 = llvm.add %43, %44 : i64 - %46 = llvm.icmp "slt" %45, %35 : i64 - llvm.cond_br %46, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %47 = llvm.srem %45, %36 : i64 - %48 = llvm.icmp "slt" %47, %34 : i64 - %49 = llvm.add %47, %36 : i64 - %50 = llvm.select %48, %49, %47 : i1, i64 - %51 = llvm.icmp "slt" %45, %34 : i64 - %52 = llvm.sub %37, %45 : i64 - %53 = llvm.select %51, %52, %45 : i1, i64 - %54 = llvm.sdiv %53, %36 : i64 - %55 = llvm.sub %37, %54 : i64 - %56 = llvm.select %51, %55, %54 : i1, i64 - %57 = llvm.srem %56, %36 : i64 - %58 = llvm.icmp "slt" %57, %34 : i64 - %59 = llvm.add %57, %36 : i64 - %60 = llvm.select %58, %59, %57 : i1, i64 - %61 = llvm.icmp "slt" %56, %34 : i64 - %62 = llvm.sub %37, %56 : i64 - %63 = llvm.select %61, %62, %56 : i1, i64 - %64 = llvm.sdiv %63, %36 : i64 - %65 = llvm.sub %37, %64 : i64 - %66 = llvm.select %61, %65, %64 : i1, i64 - %67 = llvm.mul %34, %35 : i64 - %68 = llvm.mlir.constant(3136 : index) : i64 - %69 = llvm.mul %66, %68 : i64 - %70 = llvm.add %67, %69 : i64 - %71 = llvm.mul %60, %36 : i64 - %72 = llvm.add %70, %71 : i64 - %73 = llvm.add %72, %50 : i64 - %74 = llvm.getelementptr %arg23[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %75 = llvm.load %74 : !llvm.ptr -> f16 - %76 = llvm.getelementptr %arg1[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %77 = llvm.load %76 : !llvm.ptr -> f16 - %78 = llvm.getelementptr %arg12[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %79 = llvm.load %78 : !llvm.ptr -> f16 - %80 = llvm.fadd %77, %79 : f16 - %81 = llvm.fcmp "ogt" %75, %33 : f16 - %82 = llvm.select %81, %80, %33 : i1, f16 - %83 = llvm.getelementptr %arg34[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %82, %83 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 + %17 = llvm.mlir.constant(0 : index) : i64 + %18 = llvm.mlir.constant(9408 : index) : i64 + %19 = nvvm.read.ptx.sreg.ctaid.x : 
i32 + %20 = llvm.sext %19 : i32 to i64 + %21 = nvvm.read.ptx.sreg.ntid.x : i32 + %22 = llvm.sext %21 : i32 to i64 + %23 = nvvm.read.ptx.sreg.tid.x : i32 + %24 = llvm.sext %23 : i32 to i64 + %25 = llvm.mul %22, %20 : i64 + %26 = llvm.add %24, %25 : i64 + %27 = nvvm.read.ptx.sreg.nctaid.x : i32 + %28 = llvm.sext %27 : i32 to i64 + %29 = llvm.mul %22, %28 : i64 + llvm.br ^bb1(%26 : i64) + ^bb1(%30: i64): // 2 preds: ^bb0, ^bb2 + %31 = llvm.icmp "slt" %30, %18 : i64 + llvm.cond_br %31, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %32 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %33 = llvm.insertvalue %30, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %34 = llvm.mlir.constant(1 : index) : i64 + %35 = llvm.insertvalue %34, %33[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %36 = llvm.mlir.constant(147 : index) : i64 + %37 = llvm.insertvalue %36, %35[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %34, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.mlir.constant(49 : index) : i64 + %40 = llvm.insertvalue %39, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.insertvalue %34, %40[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %42 = llvm.mlir.constant(7 : index) : i64 + %43 = llvm.getelementptr %arg1[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %44 = llvm.mul %17, %36 : i64 + %45 = llvm.mul %17, %39 : i64 + %46 = llvm.add %44, %45 : i64 + %47 = llvm.mul %17, %42 : i64 + %48 = llvm.add %46, %47 : i64 + %49 = llvm.add %48, %17 : i64 + %50 = llvm.getelementptr %43[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %51 = llvm.load %50 : !llvm.ptr -> f16 + %52 = llvm.fpext %51 : f16 to f32 + %53 = llvm.insertvalue %30, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %34, %53[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.insertvalue %36, %54[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %56 = llvm.insertvalue %34, %55[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %39, %56[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.insertvalue %34, %57[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %59 = llvm.getelementptr %arg12[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %60 = llvm.getelementptr %59[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %52, %60 : f32, !llvm.ptr + %61 = llvm.add %30, %29 : i64 + llvm.br ^bb1(%61 : i64) + ^bb3: // pred: ^bb1 llvm.return } - llvm.func @Unknown50(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.func @Unknown74(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: 
i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr {llvm.noalias}, %arg23: !llvm.ptr {llvm.noalias}, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -2189,62 +898,70 @@ module attributes {byre.container_module, gpu.container_module} { %22 = llvm.insertvalue %arg26, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %23 = llvm.insertvalue %arg30, %22[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %24 = llvm.insertvalue %arg27, %23[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %25 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %26 = llvm.mlir.constant(0 : index) : i64 - %27 = llvm.mlir.constant(100352 : index) : i64 - %28 = llvm.mlir.constant(28 : index) : i64 - %29 = llvm.mlir.constant(-1 : index) : i64 - %30 = nvvm.read.ptx.sreg.ctaid.x : i32 + %25 = llvm.mlir.constant(802816 : index) : i64 + %26 = llvm.mlir.constant(0.000000e+00 : f16) : f16 + %27 = llvm.mlir.constant(0 : index) : i64 + %28 = nvvm.read.ptx.sreg.ctaid.x : i32 + %29 = llvm.sext %28 : i32 to i64 + %30 = nvvm.read.ptx.sreg.ntid.x : i32 %31 = llvm.sext %30 : i32 to i64 - %32 = nvvm.read.ptx.sreg.ntid.x : i32 + %32 = nvvm.read.ptx.sreg.tid.x : i32 %33 = llvm.sext %32 : i32 to i64 - %34 = nvvm.read.ptx.sreg.tid.x : i32 - %35 = llvm.sext %34 : i32 to i64 - %36 = llvm.mul %33, %31 : i64 - %37 = llvm.add %35, %36 : i64 - %38 = llvm.icmp "slt" %37, %27 : i64 - llvm.cond_br %38, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %39 = llvm.srem %37, %28 : i64 - %40 = llvm.icmp "slt" %39, %26 : i64 - %41 = llvm.add %39, %28 : i64 - %42 = llvm.select %40, %41, %39 : i1, i64 - %43 = llvm.icmp "slt" %37, %26 : i64 - %44 = llvm.sub %29, %37 : i64 - %45 = llvm.select %43, %44, %37 : i1, i64 - %46 = llvm.sdiv %45, %28 : i64 - %47 = llvm.sub %29, %46 : i64 - %48 = llvm.select %43, %47, %46 : i1, i64 - %49 = llvm.srem %48, %28 : i64 - %50 = llvm.icmp "slt" %49, %26 : i64 - %51 = llvm.add %49, %28 : i64 - %52 = llvm.select %50, %51, %49 : i1, i64 - %53 = llvm.icmp "slt" %48, %26 : i64 - %54 = llvm.sub %29, %48 : i64 - %55 = llvm.select %53, %54, %48 : i1, i64 - %56 = llvm.sdiv %55, %28 : i64 - %57 = llvm.sub %29, %56 : i64 - %58 = llvm.select %53, %57, %56 : i1, i64 - %59 = llvm.mul %26, %27 : i64 - %60 = llvm.mlir.constant(784 : index) : i64 - %61 = llvm.mul %58, %60 : i64 - %62 = llvm.add %59, %61 : i64 - %63 = llvm.mul %52, %28 : i64 - %64 = llvm.add %62, %63 : i64 - %65 = llvm.add %64, %42 : i64 - %66 = llvm.getelementptr %arg1[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %67 = llvm.load %66 : !llvm.ptr -> f16 - %68 = llvm.getelementptr %arg12[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %69 = llvm.load %68 : !llvm.ptr -> f16 - %70 = llvm.fcmp "ogt" %67, %25 : f16 - %71 = llvm.select %70, %69, %25 : i1, f16 - %72 = llvm.getelementptr %arg23[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 + %34 = llvm.mul %31, %29 : i64 + %35 = llvm.add %33, %34 : i64 + %36 = nvvm.read.ptx.sreg.nctaid.x : i32 + %37 = llvm.sext %36 : i32 to i64 + %38 = llvm.mul %31, %37 : i64 + llvm.br ^bb1(%35 : i64) + ^bb1(%39: i64): // 2 
preds: ^bb0, ^bb2 + %40 = llvm.icmp "slt" %39, %25 : i64 + llvm.cond_br %40, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %41 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %42 = llvm.insertvalue %39, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %43 = llvm.mlir.constant(1 : index) : i64 + %44 = llvm.insertvalue %43, %42[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %45 = llvm.insertvalue %25, %44[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %46 = llvm.insertvalue %43, %45[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %47 = llvm.mlir.constant(12544 : index) : i64 + %48 = llvm.insertvalue %47, %46[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %49 = llvm.insertvalue %43, %48[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %50 = llvm.mlir.constant(112 : index) : i64 + %51 = llvm.getelementptr %arg1[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %52 = llvm.mul %27, %25 : i64 + %53 = llvm.mul %27, %47 : i64 + %54 = llvm.add %52, %53 : i64 + %55 = llvm.mul %27, %50 : i64 + %56 = llvm.add %54, %55 : i64 + %57 = llvm.add %56, %27 : i64 + %58 = llvm.getelementptr %51[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %59 = llvm.load %58 : !llvm.ptr -> f16 + %60 = llvm.insertvalue %39, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %61 = llvm.insertvalue %43, %60[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %62 = llvm.insertvalue %25, %61[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %63 = llvm.insertvalue %43, %62[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %64 = llvm.insertvalue %47, %63[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %65 = llvm.insertvalue %43, %64[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %66 = llvm.getelementptr %arg12[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %67 = llvm.getelementptr %66[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %68 = llvm.load %67 : !llvm.ptr -> f16 + %69 = llvm.fcmp "ogt" %59, %26 : f16 + %70 = llvm.select %69, %68, %26 : i1, f16 + %71 = llvm.insertvalue %39, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %72 = llvm.insertvalue %43, %71[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %73 = llvm.insertvalue %25, %72[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %74 = llvm.insertvalue %43, %73[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %75 = llvm.insertvalue %47, %74[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %76 = llvm.insertvalue %43, %75[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %77 = llvm.getelementptr %arg23[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %78 = llvm.getelementptr %77[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %70, %78 : f16, !llvm.ptr + %79 = llvm.add %39, %38 : i64 + llvm.br ^bb1(%79 : i64) + ^bb3: // pred: ^bb1 llvm.return } - llvm.func @Unknown46(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, 
%arg31: i64, %arg32: i64, %arg33: !llvm.ptr, %arg34: !llvm.ptr, %arg35: i64, %arg36: i64, %arg37: i64, %arg38: i64, %arg39: i64, %arg40: i64, %arg41: i64, %arg42: i64, %arg43: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.func @Unknown73(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr {llvm.noalias}, %arg23: !llvm.ptr {llvm.noalias}, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -2270,73 +987,68 @@ module attributes {byre.container_module, gpu.container_module} { %22 = llvm.insertvalue %arg26, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %23 = llvm.insertvalue %arg30, %22[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %24 = llvm.insertvalue %arg27, %23[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %25 = llvm.insertvalue %arg33, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %26 = llvm.insertvalue %arg34, %25[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %27 = llvm.insertvalue %arg35, %26[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %28 = llvm.insertvalue %arg36, %27[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %29 = llvm.insertvalue %arg40, %28[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %30 = llvm.insertvalue %arg37, %29[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %31 = llvm.insertvalue %arg41, %30[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %32 = llvm.insertvalue %arg38, %31[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %33 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %34 = llvm.mlir.constant(0 : index) : i64 - %35 = llvm.mlir.constant(100352 : index) : i64 - %36 = llvm.mlir.constant(28 : index) : i64 - %37 = llvm.mlir.constant(-1 : index) : i64 - %38 = nvvm.read.ptx.sreg.ctaid.x : i32 - %39 = llvm.sext %38 : i32 to i64 - %40 = nvvm.read.ptx.sreg.ntid.x : i32 - %41 = llvm.sext %40 : i32 to i64 - %42 = nvvm.read.ptx.sreg.tid.x : i32 - %43 = llvm.sext %42 : i32 to i64 - %44 = llvm.mul %41, %39 : i64 - %45 = llvm.add %43, %44 : i64 - %46 = llvm.icmp "slt" %45, %35 : i64 - llvm.cond_br %46, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %47 = llvm.srem %45, %36 : i64 - %48 = llvm.icmp "slt" %47, %34 : i64 - %49 = llvm.add %47, %36 : i64 - %50 = llvm.select %48, %49, %47 : i1, i64 - %51 = llvm.icmp "slt" %45, %34 : i64 - %52 = llvm.sub %37, %45 : i64 - %53 = llvm.select %51, %52, %45 : i1, i64 - %54 = llvm.sdiv %53, %36 : i64 - %55 = llvm.sub %37, %54 : i64 - %56 = llvm.select %51, %55, %54 : i1, i64 - %57 = llvm.srem %56, %36 : i64 - %58 = llvm.icmp "slt" %57, %34 : i64 - %59 = llvm.add %57, %36 : i64 - %60 = llvm.select %58, %59, %57 : i1, i64 - %61 = llvm.icmp "slt" %56, %34 : i64 - %62 = llvm.sub 
%37, %56 : i64 - %63 = llvm.select %61, %62, %56 : i1, i64 - %64 = llvm.sdiv %63, %36 : i64 - %65 = llvm.sub %37, %64 : i64 - %66 = llvm.select %61, %65, %64 : i1, i64 - %67 = llvm.mul %34, %35 : i64 - %68 = llvm.mlir.constant(784 : index) : i64 - %69 = llvm.mul %66, %68 : i64 - %70 = llvm.add %67, %69 : i64 - %71 = llvm.mul %60, %36 : i64 - %72 = llvm.add %70, %71 : i64 - %73 = llvm.add %72, %50 : i64 - %74 = llvm.getelementptr %arg23[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %75 = llvm.load %74 : !llvm.ptr -> f16 - %76 = llvm.getelementptr %arg1[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %77 = llvm.load %76 : !llvm.ptr -> f16 - %78 = llvm.getelementptr %arg12[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %79 = llvm.load %78 : !llvm.ptr -> f16 - %80 = llvm.fadd %77, %79 : f16 - %81 = llvm.fcmp "ogt" %75, %33 : f16 - %82 = llvm.select %81, %80, %33 : i1, f16 - %83 = llvm.getelementptr %arg34[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %82, %83 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 + %25 = llvm.mlir.constant(200704 : index) : i64 + %26 = llvm.mlir.constant(0 : index) : i64 + %27 = nvvm.read.ptx.sreg.ctaid.x : i32 + %28 = llvm.sext %27 : i32 to i64 + %29 = nvvm.read.ptx.sreg.ntid.x : i32 + %30 = llvm.sext %29 : i32 to i64 + %31 = nvvm.read.ptx.sreg.tid.x : i32 + %32 = llvm.sext %31 : i32 to i64 + %33 = llvm.mul %30, %28 : i64 + %34 = llvm.add %32, %33 : i64 + %35 = nvvm.read.ptx.sreg.nctaid.x : i32 + %36 = llvm.sext %35 : i32 to i64 + %37 = llvm.mul %30, %36 : i64 + llvm.br ^bb1(%34 : i64) + ^bb1(%38: i64): // 2 preds: ^bb0, ^bb2 + %39 = llvm.icmp "slt" %38, %25 : i64 + llvm.cond_br %39, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %40 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %41 = llvm.insertvalue %38, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %42 = llvm.mlir.constant(1 : index) : i64 + %43 = llvm.insertvalue %42, %41[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %44 = llvm.insertvalue %25, %43[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %45 = llvm.insertvalue %42, %44[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %46 = llvm.mlir.constant(3136 : index) : i64 + %47 = llvm.insertvalue %46, %45[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %48 = llvm.insertvalue %42, %47[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %49 = llvm.mlir.constant(56 : index) : i64 + %50 = llvm.getelementptr %arg1[%38] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %51 = llvm.mul %26, %25 : i64 + %52 = llvm.mul %26, %46 : i64 + %53 = llvm.add %51, %52 : i64 + %54 = llvm.mul %26, %49 : i64 + %55 = llvm.add %53, %54 : i64 + %56 = llvm.add %55, %26 : i64 + %57 = llvm.getelementptr %50[%56] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %58 = llvm.load %57 : !llvm.ptr -> f16 + %59 = llvm.insertvalue %38, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %60 = llvm.insertvalue %42, %59[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %61 = llvm.insertvalue %25, %60[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %62 = llvm.insertvalue %42, %61[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %63 = llvm.insertvalue %46, %62[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %64 = llvm.insertvalue %42, %63[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %65 = llvm.getelementptr %arg12[%38] : (!llvm.ptr, i64) 
-> !llvm.ptr, f16 + %66 = llvm.getelementptr %65[%56] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %67 = llvm.load %66 : !llvm.ptr -> f16 + %68 = llvm.fadd %58, %67 : f16 + %69 = llvm.insertvalue %38, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %70 = llvm.insertvalue %42, %69[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %71 = llvm.insertvalue %25, %70[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %72 = llvm.insertvalue %42, %71[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %73 = llvm.insertvalue %46, %72[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %74 = llvm.insertvalue %42, %73[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %75 = llvm.getelementptr %arg23[%38] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %76 = llvm.getelementptr %75[%56] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %68, %76 : f16, !llvm.ptr + %77 = llvm.add %38, %37 : i64 + llvm.br ^bb1(%77 : i64) + ^bb3: // pred: ^bb1 llvm.return } - llvm.func @Unknown42(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.func @Unknown61(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr {llvm.noalias}, %arg23: !llvm.ptr {llvm.noalias}, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -2362,62 +1074,70 @@ module attributes {byre.container_module, gpu.container_module} { %22 = llvm.insertvalue %arg26, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %23 = llvm.insertvalue %arg30, %22[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %24 = llvm.insertvalue %arg27, %23[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %25 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %26 = llvm.mlir.constant(0 : index) : i64 - %27 = llvm.mlir.constant(100352 : index) : i64 - %28 = llvm.mlir.constant(28 : index) : i64 - %29 = llvm.mlir.constant(-1 : index) : i64 - %30 = nvvm.read.ptx.sreg.ctaid.x : i32 + %25 = llvm.mlir.constant(200704 : index) : i64 + %26 = llvm.mlir.constant(0.000000e+00 : f16) : f16 + %27 = llvm.mlir.constant(0 : index) : i64 + %28 = nvvm.read.ptx.sreg.ctaid.x : i32 + %29 = llvm.sext %28 : i32 to i64 + %30 = nvvm.read.ptx.sreg.ntid.x : i32 %31 = llvm.sext %30 : i32 to i64 - %32 = nvvm.read.ptx.sreg.ntid.x : i32 + %32 = nvvm.read.ptx.sreg.tid.x : i32 %33 = llvm.sext %32 : 
i32 to i64 - %34 = nvvm.read.ptx.sreg.tid.x : i32 - %35 = llvm.sext %34 : i32 to i64 - %36 = llvm.mul %33, %31 : i64 - %37 = llvm.add %35, %36 : i64 - %38 = llvm.icmp "slt" %37, %27 : i64 - llvm.cond_br %38, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %39 = llvm.srem %37, %28 : i64 - %40 = llvm.icmp "slt" %39, %26 : i64 - %41 = llvm.add %39, %28 : i64 - %42 = llvm.select %40, %41, %39 : i1, i64 - %43 = llvm.icmp "slt" %37, %26 : i64 - %44 = llvm.sub %29, %37 : i64 - %45 = llvm.select %43, %44, %37 : i1, i64 - %46 = llvm.sdiv %45, %28 : i64 - %47 = llvm.sub %29, %46 : i64 - %48 = llvm.select %43, %47, %46 : i1, i64 - %49 = llvm.srem %48, %28 : i64 - %50 = llvm.icmp "slt" %49, %26 : i64 - %51 = llvm.add %49, %28 : i64 - %52 = llvm.select %50, %51, %49 : i1, i64 - %53 = llvm.icmp "slt" %48, %26 : i64 - %54 = llvm.sub %29, %48 : i64 - %55 = llvm.select %53, %54, %48 : i1, i64 - %56 = llvm.sdiv %55, %28 : i64 - %57 = llvm.sub %29, %56 : i64 - %58 = llvm.select %53, %57, %56 : i1, i64 - %59 = llvm.mul %26, %27 : i64 - %60 = llvm.mlir.constant(784 : index) : i64 - %61 = llvm.mul %58, %60 : i64 - %62 = llvm.add %59, %61 : i64 - %63 = llvm.mul %52, %28 : i64 - %64 = llvm.add %62, %63 : i64 - %65 = llvm.add %64, %42 : i64 - %66 = llvm.getelementptr %arg1[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %67 = llvm.load %66 : !llvm.ptr -> f16 - %68 = llvm.getelementptr %arg12[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %69 = llvm.load %68 : !llvm.ptr -> f16 - %70 = llvm.fcmp "ogt" %67, %25 : f16 - %71 = llvm.select %70, %69, %25 : i1, f16 - %72 = llvm.getelementptr %arg23[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 + %34 = llvm.mul %31, %29 : i64 + %35 = llvm.add %33, %34 : i64 + %36 = nvvm.read.ptx.sreg.nctaid.x : i32 + %37 = llvm.sext %36 : i32 to i64 + %38 = llvm.mul %31, %37 : i64 + llvm.br ^bb1(%35 : i64) + ^bb1(%39: i64): // 2 preds: ^bb0, ^bb2 + %40 = llvm.icmp "slt" %39, %25 : i64 + llvm.cond_br %40, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %41 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %42 = llvm.insertvalue %39, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %43 = llvm.mlir.constant(1 : index) : i64 + %44 = llvm.insertvalue %43, %42[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %45 = llvm.insertvalue %25, %44[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %46 = llvm.insertvalue %43, %45[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %47 = llvm.mlir.constant(3136 : index) : i64 + %48 = llvm.insertvalue %47, %46[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %49 = llvm.insertvalue %43, %48[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %50 = llvm.mlir.constant(56 : index) : i64 + %51 = llvm.getelementptr %arg1[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %52 = llvm.mul %27, %25 : i64 + %53 = llvm.mul %27, %47 : i64 + %54 = llvm.add %52, %53 : i64 + %55 = llvm.mul %27, %50 : i64 + %56 = llvm.add %54, %55 : i64 + %57 = llvm.add %56, %27 : i64 + %58 = llvm.getelementptr %51[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %59 = llvm.load %58 : !llvm.ptr -> f16 + %60 = llvm.insertvalue %39, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %61 = llvm.insertvalue %43, %60[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %62 = llvm.insertvalue %25, %61[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %63 = 
llvm.insertvalue %43, %62[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %64 = llvm.insertvalue %47, %63[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %65 = llvm.insertvalue %43, %64[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %66 = llvm.getelementptr %arg12[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %67 = llvm.getelementptr %66[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %68 = llvm.load %67 : !llvm.ptr -> f16 + %69 = llvm.fcmp "ogt" %59, %26 : f16 + %70 = llvm.select %69, %68, %26 : i1, f16 + %71 = llvm.insertvalue %39, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %72 = llvm.insertvalue %43, %71[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %73 = llvm.insertvalue %25, %72[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %74 = llvm.insertvalue %43, %73[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %75 = llvm.insertvalue %47, %74[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %76 = llvm.insertvalue %43, %75[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %77 = llvm.getelementptr %arg23[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %78 = llvm.getelementptr %77[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %70, %78 : f16, !llvm.ptr + %79 = llvm.add %39, %38 : i64 + llvm.br ^bb1(%79 : i64) + ^bb3: // pred: ^bb1 llvm.return } - llvm.func @Unknown38(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64, %arg33: !llvm.ptr, %arg34: !llvm.ptr, %arg35: i64, %arg36: i64, %arg37: i64, %arg38: i64, %arg39: i64, %arg40: i64, %arg41: i64, %arg42: i64, %arg43: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.func @Unknown57(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr {llvm.noalias}, %arg23: !llvm.ptr {llvm.noalias}, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64, %arg33: !llvm.ptr {llvm.noalias}, %arg34: !llvm.ptr {llvm.noalias}, %arg35: i64, %arg36: i64, %arg37: i64, %arg38: i64, %arg39: i64, %arg40: i64, %arg41: i64, %arg42: i64, %arg43: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -2451,65 +1171,80 @@ module attributes {byre.container_module, gpu.container_module} { %30 = llvm.insertvalue %arg37, %29[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %31 = llvm.insertvalue %arg41, %30[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %32 = llvm.insertvalue 
%arg38, %31[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %33 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %34 = llvm.mlir.constant(0 : index) : i64 - %35 = llvm.mlir.constant(100352 : index) : i64 - %36 = llvm.mlir.constant(28 : index) : i64 - %37 = llvm.mlir.constant(-1 : index) : i64 - %38 = nvvm.read.ptx.sreg.ctaid.x : i32 + %33 = llvm.mlir.constant(200704 : index) : i64 + %34 = llvm.mlir.constant(0.000000e+00 : f16) : f16 + %35 = llvm.mlir.constant(0 : index) : i64 + %36 = nvvm.read.ptx.sreg.ctaid.x : i32 + %37 = llvm.sext %36 : i32 to i64 + %38 = nvvm.read.ptx.sreg.ntid.x : i32 %39 = llvm.sext %38 : i32 to i64 - %40 = nvvm.read.ptx.sreg.ntid.x : i32 + %40 = nvvm.read.ptx.sreg.tid.x : i32 %41 = llvm.sext %40 : i32 to i64 - %42 = nvvm.read.ptx.sreg.tid.x : i32 - %43 = llvm.sext %42 : i32 to i64 - %44 = llvm.mul %41, %39 : i64 - %45 = llvm.add %43, %44 : i64 - %46 = llvm.icmp "slt" %45, %35 : i64 - llvm.cond_br %46, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %47 = llvm.srem %45, %36 : i64 - %48 = llvm.icmp "slt" %47, %34 : i64 - %49 = llvm.add %47, %36 : i64 - %50 = llvm.select %48, %49, %47 : i1, i64 - %51 = llvm.icmp "slt" %45, %34 : i64 - %52 = llvm.sub %37, %45 : i64 - %53 = llvm.select %51, %52, %45 : i1, i64 - %54 = llvm.sdiv %53, %36 : i64 - %55 = llvm.sub %37, %54 : i64 - %56 = llvm.select %51, %55, %54 : i1, i64 - %57 = llvm.srem %56, %36 : i64 - %58 = llvm.icmp "slt" %57, %34 : i64 - %59 = llvm.add %57, %36 : i64 - %60 = llvm.select %58, %59, %57 : i1, i64 - %61 = llvm.icmp "slt" %56, %34 : i64 - %62 = llvm.sub %37, %56 : i64 - %63 = llvm.select %61, %62, %56 : i1, i64 - %64 = llvm.sdiv %63, %36 : i64 - %65 = llvm.sub %37, %64 : i64 - %66 = llvm.select %61, %65, %64 : i1, i64 - %67 = llvm.mul %34, %35 : i64 - %68 = llvm.mlir.constant(784 : index) : i64 - %69 = llvm.mul %66, %68 : i64 - %70 = llvm.add %67, %69 : i64 - %71 = llvm.mul %60, %36 : i64 - %72 = llvm.add %70, %71 : i64 - %73 = llvm.add %72, %50 : i64 - %74 = llvm.getelementptr %arg23[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %75 = llvm.load %74 : !llvm.ptr -> f16 - %76 = llvm.getelementptr %arg1[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %77 = llvm.load %76 : !llvm.ptr -> f16 - %78 = llvm.getelementptr %arg12[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %79 = llvm.load %78 : !llvm.ptr -> f16 - %80 = llvm.fadd %77, %79 : f16 - %81 = llvm.fcmp "ogt" %75, %33 : f16 - %82 = llvm.select %81, %80, %33 : i1, f16 - %83 = llvm.getelementptr %arg34[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %82, %83 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 + %42 = llvm.mul %39, %37 : i64 + %43 = llvm.add %41, %42 : i64 + %44 = nvvm.read.ptx.sreg.nctaid.x : i32 + %45 = llvm.sext %44 : i32 to i64 + %46 = llvm.mul %39, %45 : i64 + llvm.br ^bb1(%43 : i64) + ^bb1(%47: i64): // 2 preds: ^bb0, ^bb2 + %48 = llvm.icmp "slt" %47, %33 : i64 + llvm.cond_br %48, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %49 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %50 = llvm.insertvalue %47, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %51 = llvm.mlir.constant(1 : index) : i64 + %52 = llvm.insertvalue %51, %50[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %53 = llvm.insertvalue %33, %52[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %51, %53[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.mlir.constant(3136 : index) : i64 + %56 = llvm.insertvalue %55, %54[4, 1] : 
!llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %51, %56[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.mlir.constant(56 : index) : i64 + %59 = llvm.getelementptr %arg1[%47] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %60 = llvm.mul %35, %33 : i64 + %61 = llvm.mul %35, %55 : i64 + %62 = llvm.add %60, %61 : i64 + %63 = llvm.mul %35, %58 : i64 + %64 = llvm.add %62, %63 : i64 + %65 = llvm.add %64, %35 : i64 + %66 = llvm.getelementptr %59[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %67 = llvm.load %66 : !llvm.ptr -> f16 + %68 = llvm.insertvalue %47, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %69 = llvm.insertvalue %51, %68[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %70 = llvm.insertvalue %33, %69[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %71 = llvm.insertvalue %51, %70[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %72 = llvm.insertvalue %55, %71[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %73 = llvm.insertvalue %51, %72[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %74 = llvm.getelementptr %arg12[%47] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %75 = llvm.getelementptr %74[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %76 = llvm.load %75 : !llvm.ptr -> f16 + %77 = llvm.insertvalue %47, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %78 = llvm.insertvalue %51, %77[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %79 = llvm.insertvalue %33, %78[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %80 = llvm.insertvalue %51, %79[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %81 = llvm.insertvalue %55, %80[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %82 = llvm.insertvalue %51, %81[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %83 = llvm.getelementptr %arg23[%47] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %84 = llvm.getelementptr %83[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %85 = llvm.load %84 : !llvm.ptr -> f16 + %86 = llvm.fadd %67, %76 : f16 + %87 = llvm.fcmp "ogt" %85, %34 : f16 + %88 = llvm.select %87, %86, %34 : i1, f16 + %89 = llvm.insertvalue %47, %26[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %90 = llvm.insertvalue %51, %89[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %91 = llvm.insertvalue %33, %90[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %92 = llvm.insertvalue %51, %91[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %93 = llvm.insertvalue %55, %92[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %94 = llvm.insertvalue %51, %93[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %95 = llvm.getelementptr %arg34[%47] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %96 = llvm.getelementptr %95[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %88, %96 : f16, !llvm.ptr + %97 = llvm.add %47, %46 : i64 + llvm.br ^bb1(%97 : i64) + ^bb3: // pred: ^bb1 llvm.return } - llvm.func @Unknown31(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: 
i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.func @Unknown42(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr {llvm.noalias}, %arg23: !llvm.ptr {llvm.noalias}, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -2535,62 +1270,70 @@ module attributes {byre.container_module, gpu.container_module} { %22 = llvm.insertvalue %arg26, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %23 = llvm.insertvalue %arg30, %22[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %24 = llvm.insertvalue %arg27, %23[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %25 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %26 = llvm.mlir.constant(0 : index) : i64 - %27 = llvm.mlir.constant(50176 : index) : i64 - %28 = llvm.mlir.constant(14 : index) : i64 - %29 = llvm.mlir.constant(-1 : index) : i64 - %30 = nvvm.read.ptx.sreg.ctaid.x : i32 + %25 = llvm.mlir.constant(100352 : index) : i64 + %26 = llvm.mlir.constant(0.000000e+00 : f16) : f16 + %27 = llvm.mlir.constant(0 : index) : i64 + %28 = nvvm.read.ptx.sreg.ctaid.x : i32 + %29 = llvm.sext %28 : i32 to i64 + %30 = nvvm.read.ptx.sreg.ntid.x : i32 %31 = llvm.sext %30 : i32 to i64 - %32 = nvvm.read.ptx.sreg.ntid.x : i32 + %32 = nvvm.read.ptx.sreg.tid.x : i32 %33 = llvm.sext %32 : i32 to i64 - %34 = nvvm.read.ptx.sreg.tid.x : i32 - %35 = llvm.sext %34 : i32 to i64 - %36 = llvm.mul %33, %31 : i64 - %37 = llvm.add %35, %36 : i64 - %38 = llvm.icmp "slt" %37, %27 : i64 - llvm.cond_br %38, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %39 = llvm.srem %37, %28 : i64 - %40 = llvm.icmp "slt" %39, %26 : i64 - %41 = llvm.add %39, %28 : i64 - %42 = llvm.select %40, %41, %39 : i1, i64 - %43 = llvm.icmp "slt" %37, %26 : i64 - %44 = llvm.sub %29, %37 : i64 - %45 = llvm.select %43, %44, %37 : i1, i64 - %46 = llvm.sdiv %45, %28 : i64 - %47 = llvm.sub %29, %46 : i64 - %48 = llvm.select %43, %47, %46 : i1, i64 - %49 = llvm.srem %48, %28 : i64 - %50 = llvm.icmp "slt" %49, %26 : i64 - %51 = llvm.add %49, %28 : i64 - %52 = llvm.select %50, %51, %49 : i1, i64 - %53 = llvm.icmp "slt" %48, %26 : i64 - %54 = llvm.sub %29, %48 : i64 - %55 = llvm.select %53, %54, %48 : i1, i64 - %56 = llvm.sdiv %55, %28 : i64 - %57 = llvm.sub %29, %56 : i64 - %58 = llvm.select %53, %57, %56 : i1, i64 - %59 = llvm.mul %26, %27 : i64 - %60 = llvm.mlir.constant(196 : index) : i64 - %61 = llvm.mul %58, %60 : i64 - %62 = llvm.add %59, %61 : i64 - %63 = llvm.mul %52, %28 : i64 - %64 = llvm.add %62, %63 : i64 - %65 = llvm.add %64, %42 : i64 - %66 = llvm.getelementptr %arg1[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %67 = llvm.load %66 : !llvm.ptr -> f16 - %68 = llvm.getelementptr %arg12[%65] : 
(!llvm.ptr, i64) -> !llvm.ptr, f16 - %69 = llvm.load %68 : !llvm.ptr -> f16 - %70 = llvm.fcmp "ogt" %67, %25 : f16 - %71 = llvm.select %70, %69, %25 : i1, f16 - %72 = llvm.getelementptr %arg23[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 + %34 = llvm.mul %31, %29 : i64 + %35 = llvm.add %33, %34 : i64 + %36 = nvvm.read.ptx.sreg.nctaid.x : i32 + %37 = llvm.sext %36 : i32 to i64 + %38 = llvm.mul %31, %37 : i64 + llvm.br ^bb1(%35 : i64) + ^bb1(%39: i64): // 2 preds: ^bb0, ^bb2 + %40 = llvm.icmp "slt" %39, %25 : i64 + llvm.cond_br %40, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %41 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %42 = llvm.insertvalue %39, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %43 = llvm.mlir.constant(1 : index) : i64 + %44 = llvm.insertvalue %43, %42[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %45 = llvm.insertvalue %25, %44[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %46 = llvm.insertvalue %43, %45[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %47 = llvm.mlir.constant(784 : index) : i64 + %48 = llvm.insertvalue %47, %46[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %49 = llvm.insertvalue %43, %48[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %50 = llvm.mlir.constant(28 : index) : i64 + %51 = llvm.getelementptr %arg1[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %52 = llvm.mul %27, %25 : i64 + %53 = llvm.mul %27, %47 : i64 + %54 = llvm.add %52, %53 : i64 + %55 = llvm.mul %27, %50 : i64 + %56 = llvm.add %54, %55 : i64 + %57 = llvm.add %56, %27 : i64 + %58 = llvm.getelementptr %51[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %59 = llvm.load %58 : !llvm.ptr -> f16 + %60 = llvm.insertvalue %39, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %61 = llvm.insertvalue %43, %60[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %62 = llvm.insertvalue %25, %61[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %63 = llvm.insertvalue %43, %62[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %64 = llvm.insertvalue %47, %63[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %65 = llvm.insertvalue %43, %64[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %66 = llvm.getelementptr %arg12[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %67 = llvm.getelementptr %66[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %68 = llvm.load %67 : !llvm.ptr -> f16 + %69 = llvm.fcmp "ogt" %59, %26 : f16 + %70 = llvm.select %69, %68, %26 : i1, f16 + %71 = llvm.insertvalue %39, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %72 = llvm.insertvalue %43, %71[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %73 = llvm.insertvalue %25, %72[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %74 = llvm.insertvalue %43, %73[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %75 = llvm.insertvalue %47, %74[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %76 = llvm.insertvalue %43, %75[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %77 = llvm.getelementptr %arg23[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %78 = llvm.getelementptr %77[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %70, %78 : f16, !llvm.ptr + %79 = 
llvm.add %39, %38 : i64 + llvm.br ^bb1(%79 : i64) + ^bb3: // pred: ^bb1 llvm.return } - llvm.func @Unknown27(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64, %arg33: !llvm.ptr, %arg34: !llvm.ptr, %arg35: i64, %arg36: i64, %arg37: i64, %arg38: i64, %arg39: i64, %arg40: i64, %arg41: i64, %arg42: i64, %arg43: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.func @Unknown38(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr {llvm.noalias}, %arg23: !llvm.ptr {llvm.noalias}, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64, %arg33: !llvm.ptr {llvm.noalias}, %arg34: !llvm.ptr {llvm.noalias}, %arg35: i64, %arg36: i64, %arg37: i64, %arg38: i64, %arg39: i64, %arg40: i64, %arg41: i64, %arg42: i64, %arg43: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -2624,65 +1367,80 @@ module attributes {byre.container_module, gpu.container_module} { %30 = llvm.insertvalue %arg37, %29[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %31 = llvm.insertvalue %arg41, %30[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %32 = llvm.insertvalue %arg38, %31[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %33 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %34 = llvm.mlir.constant(0 : index) : i64 - %35 = llvm.mlir.constant(50176 : index) : i64 - %36 = llvm.mlir.constant(14 : index) : i64 - %37 = llvm.mlir.constant(-1 : index) : i64 - %38 = nvvm.read.ptx.sreg.ctaid.x : i32 + %33 = llvm.mlir.constant(100352 : index) : i64 + %34 = llvm.mlir.constant(0.000000e+00 : f16) : f16 + %35 = llvm.mlir.constant(0 : index) : i64 + %36 = nvvm.read.ptx.sreg.ctaid.x : i32 + %37 = llvm.sext %36 : i32 to i64 + %38 = nvvm.read.ptx.sreg.ntid.x : i32 %39 = llvm.sext %38 : i32 to i64 - %40 = nvvm.read.ptx.sreg.ntid.x : i32 + %40 = nvvm.read.ptx.sreg.tid.x : i32 %41 = llvm.sext %40 : i32 to i64 - %42 = nvvm.read.ptx.sreg.tid.x : i32 - %43 = llvm.sext %42 : i32 to i64 - %44 = llvm.mul %41, %39 : i64 - %45 = llvm.add %43, %44 : i64 - %46 = llvm.icmp "slt" %45, %35 : i64 - llvm.cond_br %46, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %47 = llvm.srem %45, %36 : i64 - %48 = llvm.icmp "slt" %47, %34 : i64 - %49 = llvm.add %47, %36 : i64 - %50 = llvm.select %48, %49, %47 : i1, i64 - %51 = llvm.icmp "slt" %45, %34 : i64 - %52 = llvm.sub %37, %45 : i64 - %53 = llvm.select %51, %52, %45 : i1, i64 - %54 = llvm.sdiv %53, %36 : i64 - %55 = llvm.sub %37, %54 : i64 - %56 = llvm.select %51, %55, %54 : i1, i64 - %57 = llvm.srem %56, %36 : 
i64 - %58 = llvm.icmp "slt" %57, %34 : i64 - %59 = llvm.add %57, %36 : i64 - %60 = llvm.select %58, %59, %57 : i1, i64 - %61 = llvm.icmp "slt" %56, %34 : i64 - %62 = llvm.sub %37, %56 : i64 - %63 = llvm.select %61, %62, %56 : i1, i64 - %64 = llvm.sdiv %63, %36 : i64 - %65 = llvm.sub %37, %64 : i64 - %66 = llvm.select %61, %65, %64 : i1, i64 - %67 = llvm.mul %34, %35 : i64 - %68 = llvm.mlir.constant(196 : index) : i64 - %69 = llvm.mul %66, %68 : i64 - %70 = llvm.add %67, %69 : i64 - %71 = llvm.mul %60, %36 : i64 - %72 = llvm.add %70, %71 : i64 - %73 = llvm.add %72, %50 : i64 - %74 = llvm.getelementptr %arg23[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %75 = llvm.load %74 : !llvm.ptr -> f16 - %76 = llvm.getelementptr %arg1[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %77 = llvm.load %76 : !llvm.ptr -> f16 - %78 = llvm.getelementptr %arg12[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %79 = llvm.load %78 : !llvm.ptr -> f16 - %80 = llvm.fadd %77, %79 : f16 - %81 = llvm.fcmp "ogt" %75, %33 : f16 - %82 = llvm.select %81, %80, %33 : i1, f16 - %83 = llvm.getelementptr %arg34[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %82, %83 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 + %42 = llvm.mul %39, %37 : i64 + %43 = llvm.add %41, %42 : i64 + %44 = nvvm.read.ptx.sreg.nctaid.x : i32 + %45 = llvm.sext %44 : i32 to i64 + %46 = llvm.mul %39, %45 : i64 + llvm.br ^bb1(%43 : i64) + ^bb1(%47: i64): // 2 preds: ^bb0, ^bb2 + %48 = llvm.icmp "slt" %47, %33 : i64 + llvm.cond_br %48, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %49 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %50 = llvm.insertvalue %47, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %51 = llvm.mlir.constant(1 : index) : i64 + %52 = llvm.insertvalue %51, %50[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %53 = llvm.insertvalue %33, %52[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %51, %53[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.mlir.constant(784 : index) : i64 + %56 = llvm.insertvalue %55, %54[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %51, %56[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.mlir.constant(28 : index) : i64 + %59 = llvm.getelementptr %arg1[%47] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %60 = llvm.mul %35, %33 : i64 + %61 = llvm.mul %35, %55 : i64 + %62 = llvm.add %60, %61 : i64 + %63 = llvm.mul %35, %58 : i64 + %64 = llvm.add %62, %63 : i64 + %65 = llvm.add %64, %35 : i64 + %66 = llvm.getelementptr %59[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %67 = llvm.load %66 : !llvm.ptr -> f16 + %68 = llvm.insertvalue %47, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %69 = llvm.insertvalue %51, %68[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %70 = llvm.insertvalue %33, %69[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %71 = llvm.insertvalue %51, %70[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %72 = llvm.insertvalue %55, %71[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %73 = llvm.insertvalue %51, %72[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %74 = llvm.getelementptr %arg12[%47] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %75 = llvm.getelementptr %74[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %76 = llvm.load %75 : !llvm.ptr -> f16 + %77 = 
llvm.insertvalue %47, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %78 = llvm.insertvalue %51, %77[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %79 = llvm.insertvalue %33, %78[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %80 = llvm.insertvalue %51, %79[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %81 = llvm.insertvalue %55, %80[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %82 = llvm.insertvalue %51, %81[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %83 = llvm.getelementptr %arg23[%47] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %84 = llvm.getelementptr %83[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %85 = llvm.load %84 : !llvm.ptr -> f16 + %86 = llvm.fadd %67, %76 : f16 + %87 = llvm.fcmp "ogt" %85, %34 : f16 + %88 = llvm.select %87, %86, %34 : i1, f16 + %89 = llvm.insertvalue %47, %26[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %90 = llvm.insertvalue %51, %89[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %91 = llvm.insertvalue %33, %90[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %92 = llvm.insertvalue %51, %91[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %93 = llvm.insertvalue %55, %92[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %94 = llvm.insertvalue %51, %93[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %95 = llvm.getelementptr %arg34[%47] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %96 = llvm.getelementptr %95[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %88, %96 : f16, !llvm.ptr + %97 = llvm.add %47, %46 : i64 + llvm.br ^bb1(%97 : i64) + ^bb3: // pred: ^bb1 llvm.return } - llvm.func @Unknown23(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.func @Unknown23(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr {llvm.noalias}, %arg23: !llvm.ptr {llvm.noalias}, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -2708,62 +1466,70 @@ module attributes {byre.container_module, gpu.container_module} { %22 = llvm.insertvalue %arg26, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %23 = llvm.insertvalue %arg30, %22[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %24 = llvm.insertvalue 
%arg27, %23[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %25 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %26 = llvm.mlir.constant(0 : index) : i64 - %27 = llvm.mlir.constant(50176 : index) : i64 - %28 = llvm.mlir.constant(14 : index) : i64 - %29 = llvm.mlir.constant(-1 : index) : i64 - %30 = nvvm.read.ptx.sreg.ctaid.x : i32 + %25 = llvm.mlir.constant(50176 : index) : i64 + %26 = llvm.mlir.constant(0.000000e+00 : f16) : f16 + %27 = llvm.mlir.constant(0 : index) : i64 + %28 = nvvm.read.ptx.sreg.ctaid.x : i32 + %29 = llvm.sext %28 : i32 to i64 + %30 = nvvm.read.ptx.sreg.ntid.x : i32 %31 = llvm.sext %30 : i32 to i64 - %32 = nvvm.read.ptx.sreg.ntid.x : i32 + %32 = nvvm.read.ptx.sreg.tid.x : i32 %33 = llvm.sext %32 : i32 to i64 - %34 = nvvm.read.ptx.sreg.tid.x : i32 - %35 = llvm.sext %34 : i32 to i64 - %36 = llvm.mul %33, %31 : i64 - %37 = llvm.add %35, %36 : i64 - %38 = llvm.icmp "slt" %37, %27 : i64 - llvm.cond_br %38, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %39 = llvm.srem %37, %28 : i64 - %40 = llvm.icmp "slt" %39, %26 : i64 - %41 = llvm.add %39, %28 : i64 - %42 = llvm.select %40, %41, %39 : i1, i64 - %43 = llvm.icmp "slt" %37, %26 : i64 - %44 = llvm.sub %29, %37 : i64 - %45 = llvm.select %43, %44, %37 : i1, i64 - %46 = llvm.sdiv %45, %28 : i64 - %47 = llvm.sub %29, %46 : i64 - %48 = llvm.select %43, %47, %46 : i1, i64 - %49 = llvm.srem %48, %28 : i64 - %50 = llvm.icmp "slt" %49, %26 : i64 - %51 = llvm.add %49, %28 : i64 - %52 = llvm.select %50, %51, %49 : i1, i64 - %53 = llvm.icmp "slt" %48, %26 : i64 - %54 = llvm.sub %29, %48 : i64 - %55 = llvm.select %53, %54, %48 : i1, i64 - %56 = llvm.sdiv %55, %28 : i64 - %57 = llvm.sub %29, %56 : i64 - %58 = llvm.select %53, %57, %56 : i1, i64 - %59 = llvm.mul %26, %27 : i64 - %60 = llvm.mlir.constant(196 : index) : i64 - %61 = llvm.mul %58, %60 : i64 - %62 = llvm.add %59, %61 : i64 - %63 = llvm.mul %52, %28 : i64 - %64 = llvm.add %62, %63 : i64 - %65 = llvm.add %64, %42 : i64 - %66 = llvm.getelementptr %arg1[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %67 = llvm.load %66 : !llvm.ptr -> f16 - %68 = llvm.getelementptr %arg12[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %69 = llvm.load %68 : !llvm.ptr -> f16 - %70 = llvm.fcmp "ogt" %67, %25 : f16 - %71 = llvm.select %70, %69, %25 : i1, f16 - %72 = llvm.getelementptr %arg23[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 + %34 = llvm.mul %31, %29 : i64 + %35 = llvm.add %33, %34 : i64 + %36 = nvvm.read.ptx.sreg.nctaid.x : i32 + %37 = llvm.sext %36 : i32 to i64 + %38 = llvm.mul %31, %37 : i64 + llvm.br ^bb1(%35 : i64) + ^bb1(%39: i64): // 2 preds: ^bb0, ^bb2 + %40 = llvm.icmp "slt" %39, %25 : i64 + llvm.cond_br %40, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %41 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %42 = llvm.insertvalue %39, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %43 = llvm.mlir.constant(1 : index) : i64 + %44 = llvm.insertvalue %43, %42[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %45 = llvm.insertvalue %25, %44[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %46 = llvm.insertvalue %43, %45[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %47 = llvm.mlir.constant(196 : index) : i64 + %48 = llvm.insertvalue %47, %46[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %49 = llvm.insertvalue %43, %48[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + 
%50 = llvm.mlir.constant(14 : index) : i64 + %51 = llvm.getelementptr %arg1[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %52 = llvm.mul %27, %25 : i64 + %53 = llvm.mul %27, %47 : i64 + %54 = llvm.add %52, %53 : i64 + %55 = llvm.mul %27, %50 : i64 + %56 = llvm.add %54, %55 : i64 + %57 = llvm.add %56, %27 : i64 + %58 = llvm.getelementptr %51[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %59 = llvm.load %58 : !llvm.ptr -> f16 + %60 = llvm.insertvalue %39, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %61 = llvm.insertvalue %43, %60[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %62 = llvm.insertvalue %25, %61[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %63 = llvm.insertvalue %43, %62[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %64 = llvm.insertvalue %47, %63[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %65 = llvm.insertvalue %43, %64[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %66 = llvm.getelementptr %arg12[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %67 = llvm.getelementptr %66[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %68 = llvm.load %67 : !llvm.ptr -> f16 + %69 = llvm.fcmp "ogt" %59, %26 : f16 + %70 = llvm.select %69, %68, %26 : i1, f16 + %71 = llvm.insertvalue %39, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %72 = llvm.insertvalue %43, %71[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %73 = llvm.insertvalue %25, %72[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %74 = llvm.insertvalue %43, %73[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %75 = llvm.insertvalue %47, %74[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %76 = llvm.insertvalue %43, %75[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %77 = llvm.getelementptr %arg23[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %78 = llvm.getelementptr %77[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %70, %78 : f16, !llvm.ptr + %79 = llvm.add %39, %38 : i64 + llvm.br ^bb1(%79 : i64) + ^bb3: // pred: ^bb1 llvm.return } - llvm.func @Unknown19(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64, %arg33: !llvm.ptr, %arg34: !llvm.ptr, %arg35: i64, %arg36: i64, %arg37: i64, %arg38: i64, %arg39: i64, %arg40: i64, %arg41: i64, %arg42: i64, %arg43: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.func @Unknown19(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr {llvm.noalias}, %arg23: !llvm.ptr {llvm.noalias}, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64, %arg33: !llvm.ptr {llvm.noalias}, %arg34: !llvm.ptr {llvm.noalias}, %arg35: i64, %arg36: i64, 
%arg37: i64, %arg38: i64, %arg39: i64, %arg40: i64, %arg41: i64, %arg42: i64, %arg43: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -2797,146 +1563,80 @@ module attributes {byre.container_module, gpu.container_module} { %30 = llvm.insertvalue %arg37, %29[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %31 = llvm.insertvalue %arg41, %30[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %32 = llvm.insertvalue %arg38, %31[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %33 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %34 = llvm.mlir.constant(0 : index) : i64 - %35 = llvm.mlir.constant(50176 : index) : i64 - %36 = llvm.mlir.constant(14 : index) : i64 - %37 = llvm.mlir.constant(-1 : index) : i64 - %38 = nvvm.read.ptx.sreg.ctaid.x : i32 + %33 = llvm.mlir.constant(50176 : index) : i64 + %34 = llvm.mlir.constant(0.000000e+00 : f16) : f16 + %35 = llvm.mlir.constant(0 : index) : i64 + %36 = nvvm.read.ptx.sreg.ctaid.x : i32 + %37 = llvm.sext %36 : i32 to i64 + %38 = nvvm.read.ptx.sreg.ntid.x : i32 %39 = llvm.sext %38 : i32 to i64 - %40 = nvvm.read.ptx.sreg.ntid.x : i32 + %40 = nvvm.read.ptx.sreg.tid.x : i32 %41 = llvm.sext %40 : i32 to i64 - %42 = nvvm.read.ptx.sreg.tid.x : i32 - %43 = llvm.sext %42 : i32 to i64 - %44 = llvm.mul %41, %39 : i64 - %45 = llvm.add %43, %44 : i64 - %46 = llvm.icmp "slt" %45, %35 : i64 - llvm.cond_br %46, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %47 = llvm.srem %45, %36 : i64 - %48 = llvm.icmp "slt" %47, %34 : i64 - %49 = llvm.add %47, %36 : i64 - %50 = llvm.select %48, %49, %47 : i1, i64 - %51 = llvm.icmp "slt" %45, %34 : i64 - %52 = llvm.sub %37, %45 : i64 - %53 = llvm.select %51, %52, %45 : i1, i64 - %54 = llvm.sdiv %53, %36 : i64 - %55 = llvm.sub %37, %54 : i64 - %56 = llvm.select %51, %55, %54 : i1, i64 - %57 = llvm.srem %56, %36 : i64 - %58 = llvm.icmp "slt" %57, %34 : i64 - %59 = llvm.add %57, %36 : i64 - %60 = llvm.select %58, %59, %57 : i1, i64 - %61 = llvm.icmp "slt" %56, %34 : i64 - %62 = llvm.sub %37, %56 : i64 - %63 = llvm.select %61, %62, %56 : i1, i64 - %64 = llvm.sdiv %63, %36 : i64 - %65 = llvm.sub %37, %64 : i64 - %66 = llvm.select %61, %65, %64 : i1, i64 - %67 = llvm.mul %34, %35 : i64 - %68 = llvm.mlir.constant(196 : index) : i64 - %69 = llvm.mul %66, %68 : i64 - %70 = llvm.add %67, %69 : i64 - %71 = llvm.mul %60, %36 : i64 - %72 = llvm.add %70, %71 : i64 - %73 = llvm.add %72, %50 : i64 - %74 = llvm.getelementptr %arg23[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %75 = llvm.load %74 : !llvm.ptr -> f16 - %76 = llvm.getelementptr %arg1[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %77 = llvm.load %76 : !llvm.ptr -> f16 - %78 = llvm.getelementptr %arg12[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %79 = llvm.load %78 : !llvm.ptr -> f16 - %80 = llvm.fadd %77, %79 : f16 - %81 = llvm.fcmp "ogt" %75, %33 : f16 - %82 = llvm.select %81, %80, %33 : i1, f16 - %83 = llvm.getelementptr %arg34[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %82, %83 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown12(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: 
!llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.insertvalue %arg22, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %18 = llvm.insertvalue %arg23, %17[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %19 = llvm.insertvalue %arg24, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %20 = llvm.insertvalue %arg25, %19[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %21 = llvm.insertvalue %arg29, %20[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %22 = llvm.insertvalue %arg26, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %23 = llvm.insertvalue %arg30, %22[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %24 = llvm.insertvalue %arg27, %23[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %25 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %26 = llvm.mlir.constant(0 : index) : i64 - %27 = llvm.mlir.constant(25088 : index) : i64 - %28 = llvm.mlir.constant(7 : index) : i64 - %29 = llvm.mlir.constant(-1 : index) : i64 - %30 = nvvm.read.ptx.sreg.ctaid.x : i32 - %31 = llvm.sext %30 : i32 to i64 - %32 = nvvm.read.ptx.sreg.ntid.x : i32 - %33 = llvm.sext %32 : i32 to i64 - %34 = nvvm.read.ptx.sreg.tid.x : i32 - %35 = llvm.sext %34 : i32 to i64 - %36 = llvm.mul %33, %31 : i64 - %37 = llvm.add %35, %36 : i64 - %38 = llvm.icmp "slt" %37, %27 : i64 - llvm.cond_br %38, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %39 = llvm.srem 
%37, %28 : i64 - %40 = llvm.icmp "slt" %39, %26 : i64 - %41 = llvm.add %39, %28 : i64 - %42 = llvm.select %40, %41, %39 : i1, i64 - %43 = llvm.icmp "slt" %37, %26 : i64 - %44 = llvm.sub %29, %37 : i64 - %45 = llvm.select %43, %44, %37 : i1, i64 - %46 = llvm.sdiv %45, %28 : i64 - %47 = llvm.sub %29, %46 : i64 - %48 = llvm.select %43, %47, %46 : i1, i64 - %49 = llvm.srem %48, %28 : i64 - %50 = llvm.icmp "slt" %49, %26 : i64 - %51 = llvm.add %49, %28 : i64 - %52 = llvm.select %50, %51, %49 : i1, i64 - %53 = llvm.icmp "slt" %48, %26 : i64 - %54 = llvm.sub %29, %48 : i64 - %55 = llvm.select %53, %54, %48 : i1, i64 - %56 = llvm.sdiv %55, %28 : i64 - %57 = llvm.sub %29, %56 : i64 - %58 = llvm.select %53, %57, %56 : i1, i64 - %59 = llvm.mul %26, %27 : i64 - %60 = llvm.mlir.constant(49 : index) : i64 - %61 = llvm.mul %58, %60 : i64 - %62 = llvm.add %59, %61 : i64 - %63 = llvm.mul %52, %28 : i64 + %42 = llvm.mul %39, %37 : i64 + %43 = llvm.add %41, %42 : i64 + %44 = nvvm.read.ptx.sreg.nctaid.x : i32 + %45 = llvm.sext %44 : i32 to i64 + %46 = llvm.mul %39, %45 : i64 + llvm.br ^bb1(%43 : i64) + ^bb1(%47: i64): // 2 preds: ^bb0, ^bb2 + %48 = llvm.icmp "slt" %47, %33 : i64 + llvm.cond_br %48, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %49 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %50 = llvm.insertvalue %47, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %51 = llvm.mlir.constant(1 : index) : i64 + %52 = llvm.insertvalue %51, %50[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %53 = llvm.insertvalue %33, %52[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %51, %53[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.mlir.constant(196 : index) : i64 + %56 = llvm.insertvalue %55, %54[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %51, %56[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.mlir.constant(14 : index) : i64 + %59 = llvm.getelementptr %arg1[%47] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %60 = llvm.mul %35, %33 : i64 + %61 = llvm.mul %35, %55 : i64 + %62 = llvm.add %60, %61 : i64 + %63 = llvm.mul %35, %58 : i64 %64 = llvm.add %62, %63 : i64 - %65 = llvm.add %64, %42 : i64 - %66 = llvm.getelementptr %arg1[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %65 = llvm.add %64, %35 : i64 + %66 = llvm.getelementptr %59[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 %67 = llvm.load %66 : !llvm.ptr -> f16 - %68 = llvm.getelementptr %arg12[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %69 = llvm.load %68 : !llvm.ptr -> f16 - %70 = llvm.fcmp "ogt" %67, %25 : f16 - %71 = llvm.select %70, %69, %25 : i1, f16 - %72 = llvm.getelementptr %arg23[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 + %68 = llvm.insertvalue %47, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %69 = llvm.insertvalue %51, %68[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %70 = llvm.insertvalue %33, %69[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %71 = llvm.insertvalue %51, %70[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %72 = llvm.insertvalue %55, %71[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %73 = llvm.insertvalue %51, %72[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %74 = llvm.getelementptr %arg12[%47] : 
(!llvm.ptr, i64) -> !llvm.ptr, f16 + %75 = llvm.getelementptr %74[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %76 = llvm.load %75 : !llvm.ptr -> f16 + %77 = llvm.insertvalue %47, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %78 = llvm.insertvalue %51, %77[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %79 = llvm.insertvalue %33, %78[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %80 = llvm.insertvalue %51, %79[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %81 = llvm.insertvalue %55, %80[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %82 = llvm.insertvalue %51, %81[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %83 = llvm.getelementptr %arg23[%47] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %84 = llvm.getelementptr %83[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %85 = llvm.load %84 : !llvm.ptr -> f16 + %86 = llvm.fadd %67, %76 : f16 + %87 = llvm.fcmp "ogt" %85, %34 : f16 + %88 = llvm.select %87, %86, %34 : i1, f16 + %89 = llvm.insertvalue %47, %26[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %90 = llvm.insertvalue %51, %89[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %91 = llvm.insertvalue %33, %90[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %92 = llvm.insertvalue %51, %91[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %93 = llvm.insertvalue %55, %92[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %94 = llvm.insertvalue %51, %93[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %95 = llvm.getelementptr %arg34[%47] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %96 = llvm.getelementptr %95[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %88, %96 : f16, !llvm.ptr + %97 = llvm.add %47, %46 : i64 + llvm.br ^bb1(%97 : i64) + ^bb3: // pred: ^bb1 llvm.return } - llvm.func @Unknown8(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64, %arg33: !llvm.ptr, %arg34: !llvm.ptr, %arg35: i64, %arg36: i64, %arg37: i64, %arg38: i64, %arg39: i64, %arg40: i64, %arg41: i64, %arg42: i64, %arg43: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.func @Unknown8(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr {llvm.noalias}, %arg23: !llvm.ptr {llvm.noalias}, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64, %arg33: !llvm.ptr {llvm.noalias}, %arg34: !llvm.ptr {llvm.noalias}, %arg35: i64, %arg36: i64, %arg37: i64, %arg38: i64, %arg39: i64, %arg40: i64, %arg41: i64, %arg42: i64, %arg43: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, 
%0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -2970,65 +1670,80 @@ module attributes {byre.container_module, gpu.container_module} { %30 = llvm.insertvalue %arg37, %29[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %31 = llvm.insertvalue %arg41, %30[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %32 = llvm.insertvalue %arg38, %31[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %33 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %34 = llvm.mlir.constant(0 : index) : i64 - %35 = llvm.mlir.constant(25088 : index) : i64 - %36 = llvm.mlir.constant(7 : index) : i64 - %37 = llvm.mlir.constant(-1 : index) : i64 - %38 = nvvm.read.ptx.sreg.ctaid.x : i32 + %33 = llvm.mlir.constant(25088 : index) : i64 + %34 = llvm.mlir.constant(0.000000e+00 : f16) : f16 + %35 = llvm.mlir.constant(0 : index) : i64 + %36 = nvvm.read.ptx.sreg.ctaid.x : i32 + %37 = llvm.sext %36 : i32 to i64 + %38 = nvvm.read.ptx.sreg.ntid.x : i32 %39 = llvm.sext %38 : i32 to i64 - %40 = nvvm.read.ptx.sreg.ntid.x : i32 + %40 = nvvm.read.ptx.sreg.tid.x : i32 %41 = llvm.sext %40 : i32 to i64 - %42 = nvvm.read.ptx.sreg.tid.x : i32 - %43 = llvm.sext %42 : i32 to i64 - %44 = llvm.mul %41, %39 : i64 - %45 = llvm.add %43, %44 : i64 - %46 = llvm.icmp "slt" %45, %35 : i64 - llvm.cond_br %46, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %47 = llvm.srem %45, %36 : i64 - %48 = llvm.icmp "slt" %47, %34 : i64 - %49 = llvm.add %47, %36 : i64 - %50 = llvm.select %48, %49, %47 : i1, i64 - %51 = llvm.icmp "slt" %45, %34 : i64 - %52 = llvm.sub %37, %45 : i64 - %53 = llvm.select %51, %52, %45 : i1, i64 - %54 = llvm.sdiv %53, %36 : i64 - %55 = llvm.sub %37, %54 : i64 - %56 = llvm.select %51, %55, %54 : i1, i64 - %57 = llvm.srem %56, %36 : i64 - %58 = llvm.icmp "slt" %57, %34 : i64 - %59 = llvm.add %57, %36 : i64 - %60 = llvm.select %58, %59, %57 : i1, i64 - %61 = llvm.icmp "slt" %56, %34 : i64 - %62 = llvm.sub %37, %56 : i64 - %63 = llvm.select %61, %62, %56 : i1, i64 - %64 = llvm.sdiv %63, %36 : i64 - %65 = llvm.sub %37, %64 : i64 - %66 = llvm.select %61, %65, %64 : i1, i64 - %67 = llvm.mul %34, %35 : i64 - %68 = llvm.mlir.constant(49 : index) : i64 - %69 = llvm.mul %66, %68 : i64 - %70 = llvm.add %67, %69 : i64 - %71 = llvm.mul %60, %36 : i64 - %72 = llvm.add %70, %71 : i64 - %73 = llvm.add %72, %50 : i64 - %74 = llvm.getelementptr %arg23[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %75 = llvm.load %74 : !llvm.ptr -> f16 - %76 = llvm.getelementptr %arg1[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %77 = llvm.load %76 : !llvm.ptr -> f16 - %78 = llvm.getelementptr %arg12[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %79 = llvm.load %78 : !llvm.ptr -> f16 - %80 = llvm.fadd %77, %79 : f16 - %81 = llvm.fcmp "ogt" %75, %33 : f16 - %82 = llvm.select %81, %80, %33 : i1, f16 - %83 = llvm.getelementptr %arg34[%73] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %82, %83 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 + %42 = llvm.mul %39, %37 : i64 + %43 = llvm.add %41, %42 : i64 + %44 = nvvm.read.ptx.sreg.nctaid.x : i32 + %45 = llvm.sext %44 : i32 to i64 + %46 = llvm.mul %39, %45 : i64 + llvm.br ^bb1(%43 : i64) + ^bb1(%47: i64): // 2 preds: ^bb0, ^bb2 + %48 = llvm.icmp "slt" %47, %33 : i64 + llvm.cond_br %48, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %49 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %50 = llvm.insertvalue %47, %2[2] : !llvm.struct<(ptr, ptr, i64, 
array<4 x i64>, array<4 x i64>)> + %51 = llvm.mlir.constant(1 : index) : i64 + %52 = llvm.insertvalue %51, %50[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %53 = llvm.insertvalue %33, %52[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %51, %53[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.mlir.constant(49 : index) : i64 + %56 = llvm.insertvalue %55, %54[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %51, %56[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.mlir.constant(7 : index) : i64 + %59 = llvm.getelementptr %arg1[%47] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %60 = llvm.mul %35, %33 : i64 + %61 = llvm.mul %35, %55 : i64 + %62 = llvm.add %60, %61 : i64 + %63 = llvm.mul %35, %58 : i64 + %64 = llvm.add %62, %63 : i64 + %65 = llvm.add %64, %35 : i64 + %66 = llvm.getelementptr %59[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %67 = llvm.load %66 : !llvm.ptr -> f16 + %68 = llvm.insertvalue %47, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %69 = llvm.insertvalue %51, %68[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %70 = llvm.insertvalue %33, %69[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %71 = llvm.insertvalue %51, %70[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %72 = llvm.insertvalue %55, %71[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %73 = llvm.insertvalue %51, %72[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %74 = llvm.getelementptr %arg12[%47] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %75 = llvm.getelementptr %74[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %76 = llvm.load %75 : !llvm.ptr -> f16 + %77 = llvm.insertvalue %47, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %78 = llvm.insertvalue %51, %77[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %79 = llvm.insertvalue %33, %78[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %80 = llvm.insertvalue %51, %79[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %81 = llvm.insertvalue %55, %80[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %82 = llvm.insertvalue %51, %81[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %83 = llvm.getelementptr %arg23[%47] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %84 = llvm.getelementptr %83[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %85 = llvm.load %84 : !llvm.ptr -> f16 + %86 = llvm.fadd %67, %76 : f16 + %87 = llvm.fcmp "ogt" %85, %34 : f16 + %88 = llvm.select %87, %86, %34 : i1, f16 + %89 = llvm.insertvalue %47, %26[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %90 = llvm.insertvalue %51, %89[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %91 = llvm.insertvalue %33, %90[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %92 = llvm.insertvalue %51, %91[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %93 = llvm.insertvalue %55, %92[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %94 = llvm.insertvalue %51, %93[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %95 = llvm.getelementptr %arg34[%47] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %96 = llvm.getelementptr %95[%65] : (!llvm.ptr, 
i64) -> !llvm.ptr, f16 + llvm.store %88, %96 : f16, !llvm.ptr + %97 = llvm.add %47, %46 : i64 + llvm.br ^bb1(%97 : i64) + ^bb3: // pred: ^bb1 llvm.return } - llvm.func @Unknown4(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.func @Unknown4(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr {llvm.noalias}, %arg23: !llvm.ptr {llvm.noalias}, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -3054,62 +1769,70 @@ module attributes {byre.container_module, gpu.container_module} { %22 = llvm.insertvalue %arg26, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %23 = llvm.insertvalue %arg30, %22[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %24 = llvm.insertvalue %arg27, %23[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %25 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %26 = llvm.mlir.constant(0 : index) : i64 - %27 = llvm.mlir.constant(25088 : index) : i64 - %28 = llvm.mlir.constant(7 : index) : i64 - %29 = llvm.mlir.constant(-1 : index) : i64 - %30 = nvvm.read.ptx.sreg.ctaid.x : i32 + %25 = llvm.mlir.constant(25088 : index) : i64 + %26 = llvm.mlir.constant(0.000000e+00 : f16) : f16 + %27 = llvm.mlir.constant(0 : index) : i64 + %28 = nvvm.read.ptx.sreg.ctaid.x : i32 + %29 = llvm.sext %28 : i32 to i64 + %30 = nvvm.read.ptx.sreg.ntid.x : i32 %31 = llvm.sext %30 : i32 to i64 - %32 = nvvm.read.ptx.sreg.ntid.x : i32 + %32 = nvvm.read.ptx.sreg.tid.x : i32 %33 = llvm.sext %32 : i32 to i64 - %34 = nvvm.read.ptx.sreg.tid.x : i32 - %35 = llvm.sext %34 : i32 to i64 - %36 = llvm.mul %33, %31 : i64 - %37 = llvm.add %35, %36 : i64 - %38 = llvm.icmp "slt" %37, %27 : i64 - llvm.cond_br %38, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %39 = llvm.srem %37, %28 : i64 - %40 = llvm.icmp "slt" %39, %26 : i64 - %41 = llvm.add %39, %28 : i64 - %42 = llvm.select %40, %41, %39 : i1, i64 - %43 = llvm.icmp "slt" %37, %26 : i64 - %44 = llvm.sub %29, %37 : i64 - %45 = llvm.select %43, %44, %37 : i1, i64 - %46 = llvm.sdiv %45, %28 : i64 - %47 = llvm.sub %29, %46 : i64 - %48 = llvm.select %43, %47, %46 : i1, i64 - %49 = llvm.srem %48, %28 : i64 - %50 = llvm.icmp "slt" %49, %26 : i64 - %51 = llvm.add %49, %28 : i64 - %52 = llvm.select %50, %51, %49 : i1, i64 - %53 = llvm.icmp "slt" %48, %26 : i64 - %54 = llvm.sub %29, %48 : i64 - %55 = llvm.select %53, %54, %48 : i1, i64 - %56 = llvm.sdiv %55, %28 : i64 - %57 = 
llvm.sub %29, %56 : i64 - %58 = llvm.select %53, %57, %56 : i1, i64 - %59 = llvm.mul %26, %27 : i64 - %60 = llvm.mlir.constant(49 : index) : i64 - %61 = llvm.mul %58, %60 : i64 - %62 = llvm.add %59, %61 : i64 - %63 = llvm.mul %52, %28 : i64 - %64 = llvm.add %62, %63 : i64 - %65 = llvm.add %64, %42 : i64 - %66 = llvm.getelementptr %arg1[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %67 = llvm.load %66 : !llvm.ptr -> f16 - %68 = llvm.getelementptr %arg12[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %69 = llvm.load %68 : !llvm.ptr -> f16 - %70 = llvm.fcmp "ogt" %67, %25 : f16 - %71 = llvm.select %70, %69, %25 : i1, f16 - %72 = llvm.getelementptr %arg23[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 + %34 = llvm.mul %31, %29 : i64 + %35 = llvm.add %33, %34 : i64 + %36 = nvvm.read.ptx.sreg.nctaid.x : i32 + %37 = llvm.sext %36 : i32 to i64 + %38 = llvm.mul %31, %37 : i64 + llvm.br ^bb1(%35 : i64) + ^bb1(%39: i64): // 2 preds: ^bb0, ^bb2 + %40 = llvm.icmp "slt" %39, %25 : i64 + llvm.cond_br %40, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %41 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %42 = llvm.insertvalue %39, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %43 = llvm.mlir.constant(1 : index) : i64 + %44 = llvm.insertvalue %43, %42[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %45 = llvm.insertvalue %25, %44[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %46 = llvm.insertvalue %43, %45[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %47 = llvm.mlir.constant(49 : index) : i64 + %48 = llvm.insertvalue %47, %46[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %49 = llvm.insertvalue %43, %48[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %50 = llvm.mlir.constant(7 : index) : i64 + %51 = llvm.getelementptr %arg1[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %52 = llvm.mul %27, %25 : i64 + %53 = llvm.mul %27, %47 : i64 + %54 = llvm.add %52, %53 : i64 + %55 = llvm.mul %27, %50 : i64 + %56 = llvm.add %54, %55 : i64 + %57 = llvm.add %56, %27 : i64 + %58 = llvm.getelementptr %51[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %59 = llvm.load %58 : !llvm.ptr -> f16 + %60 = llvm.insertvalue %39, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %61 = llvm.insertvalue %43, %60[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %62 = llvm.insertvalue %25, %61[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %63 = llvm.insertvalue %43, %62[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %64 = llvm.insertvalue %47, %63[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %65 = llvm.insertvalue %43, %64[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %66 = llvm.getelementptr %arg12[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %67 = llvm.getelementptr %66[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %68 = llvm.load %67 : !llvm.ptr -> f16 + %69 = llvm.fcmp "ogt" %59, %26 : f16 + %70 = llvm.select %69, %68, %26 : i1, f16 + %71 = llvm.insertvalue %39, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %72 = llvm.insertvalue %43, %71[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %73 = llvm.insertvalue %25, %72[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %74 = llvm.insertvalue %43, %73[3, 1] : 
!llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %75 = llvm.insertvalue %47, %74[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %76 = llvm.insertvalue %43, %75[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %77 = llvm.getelementptr %arg23[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %78 = llvm.getelementptr %77[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %70, %78 : f16, !llvm.ptr + %79 = llvm.add %39, %38 : i64 + llvm.br ^bb1(%79 : i64) + ^bb3: // pred: ^bb1 llvm.return } - llvm.func @Unknown0(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: !llvm.ptr, %arg8: !llvm.ptr, %arg9: i64, %arg10: i64, %arg11: i64, %arg12: i64, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: !llvm.ptr, %arg19: !llvm.ptr, %arg20: i64, %arg21: i64, %arg22: i64, %arg23: i64, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.func @Unknown0(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: !llvm.ptr {llvm.noalias}, %arg8: !llvm.ptr {llvm.noalias}, %arg9: i64, %arg10: i64, %arg11: i64, %arg12: i64, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: !llvm.ptr {llvm.noalias}, %arg19: !llvm.ptr {llvm.noalias}, %arg20: i64, %arg21: i64, %arg22: i64, %arg23: i64, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> @@ -3132,64 +1855,69 @@ module attributes {byre.container_module, gpu.container_module} { %19 = llvm.insertvalue %arg22, %18[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %20 = llvm.insertvalue %arg26, %19[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %21 = llvm.insertvalue %arg23, %20[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %22 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %23 = llvm.mlir.constant(4.900000e+01 : f16) : f16 - %24 = llvm.mlir.constant(0 : index) : i64 - %25 = llvm.mlir.constant(25088 : index) : i64 - %26 = llvm.mlir.constant(7 : index) : i64 - %27 = llvm.mlir.constant(-1 : index) : i64 - %28 = nvvm.read.ptx.sreg.ctaid.x : i32 - %29 = llvm.sext %28 : i32 to i64 - %30 = nvvm.read.ptx.sreg.ntid.x : i32 - %31 = llvm.sext %30 : i32 to i64 - %32 = nvvm.read.ptx.sreg.tid.x : i32 - %33 = llvm.sext %32 : i32 to i64 - %34 = llvm.mul %31, %29 : i64 - %35 = llvm.add %33, %34 : i64 - %36 = llvm.icmp "slt" %35, %25 : i64 - llvm.cond_br %36, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %37 = llvm.srem %35, %26 : i64 - %38 = llvm.icmp "slt" %37, %24 : i64 - %39 = llvm.add %37, %26 : i64 - %40 = llvm.select %38, %39, %37 : i1, i64 - %41 = llvm.icmp "slt" %35, %24 : i64 - %42 = llvm.sub %27, %35 : i64 - %43 = llvm.select %41, %42, %35 : i1, i64 - %44 = llvm.sdiv %43, %26 : i64 - %45 = llvm.sub %27, %44 : i64 - %46 = llvm.select %41, %45, %44 : i1, i64 - %47 = llvm.srem %46, %26 : i64 - %48 = llvm.icmp "slt" %47, %24 : i64 - %49 = llvm.add %47, %26 : i64 - %50 = llvm.select %48, %49, %47 : i1, i64 - %51 = llvm.icmp "slt" %46, %24 : i64 - %52 = llvm.sub %27, %46 : i64 - %53 = llvm.select %51, %52, %46 : 
i1, i64 - %54 = llvm.sdiv %53, %26 : i64 - %55 = llvm.sub %27, %54 : i64 - %56 = llvm.select %51, %55, %54 : i1, i64 - %57 = llvm.mul %24, %25 : i64 - %58 = llvm.mlir.constant(49 : index) : i64 - %59 = llvm.mul %56, %58 : i64 - %60 = llvm.add %57, %59 : i64 - %61 = llvm.mul %50, %26 : i64 - %62 = llvm.add %60, %61 : i64 - %63 = llvm.add %62, %40 : i64 - %64 = llvm.getelementptr %arg8[%63] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %65 = llvm.load %64 : !llvm.ptr -> f16 - %66 = llvm.mlir.constant(512 : index) : i64 - %67 = llvm.mul %24, %66 : i64 - %68 = llvm.add %67, %56 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %70 = llvm.load %69 : !llvm.ptr -> f16 - %71 = llvm.fdiv %70, %23 : f16 - %72 = llvm.fcmp "ogt" %65, %22 : f16 - %73 = llvm.select %72, %71, %22 : i1, f16 - %74 = llvm.getelementptr %arg19[%63] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %73, %74 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 + %22 = llvm.mlir.constant(49 : index) : i64 + %23 = llvm.mlir.constant(25088 : index) : i64 + %24 = llvm.mlir.constant(4.900000e+01 : f16) : f16 + %25 = llvm.mlir.constant(0.000000e+00 : f16) : f16 + %26 = llvm.mlir.constant(0 : index) : i64 + %27 = nvvm.read.ptx.sreg.ctaid.x : i32 + %28 = llvm.sext %27 : i32 to i64 + %29 = nvvm.read.ptx.sreg.ntid.x : i32 + %30 = llvm.sext %29 : i32 to i64 + %31 = nvvm.read.ptx.sreg.tid.x : i32 + %32 = llvm.sext %31 : i32 to i64 + %33 = llvm.mul %30, %28 : i64 + %34 = llvm.add %32, %33 : i64 + %35 = nvvm.read.ptx.sreg.nctaid.x : i32 + %36 = llvm.sext %35 : i32 to i64 + %37 = llvm.mul %30, %36 : i64 + llvm.br ^bb1(%34 : i64) + ^bb1(%38: i64): // 2 preds: ^bb0, ^bb2 + %39 = llvm.icmp "slt" %38, %23 : i64 + llvm.cond_br %39, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %40 = llvm.sdiv %38, %22 : i64 + %41 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %42 = llvm.insertvalue %40, %2[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %43 = llvm.mlir.constant(1 : index) : i64 + %44 = llvm.insertvalue %43, %42[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %45 = llvm.mlir.constant(512 : index) : i64 + %46 = llvm.getelementptr %arg1[%40] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %47 = llvm.mul %26, %45 : i64 + %48 = llvm.add %47, %26 : i64 + %49 = llvm.getelementptr %46[%48] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %50 = llvm.load %49 : !llvm.ptr -> f16 + %51 = llvm.insertvalue %38, %7[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %52 = llvm.insertvalue %43, %51[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %53 = llvm.insertvalue %23, %52[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %43, %53[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.insertvalue %22, %54[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %56 = llvm.insertvalue %43, %55[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.mlir.constant(7 : index) : i64 + %58 = llvm.getelementptr %arg8[%38] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %59 = llvm.mul %26, %23 : i64 + %60 = llvm.mul %26, %22 : i64 + %61 = llvm.add %59, %60 : i64 + %62 = llvm.mul %26, %57 : i64 + %63 = llvm.add %61, %62 : i64 + %64 = llvm.add %63, %26 : i64 + %65 = llvm.getelementptr %58[%64] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %66 = llvm.load %65 : !llvm.ptr -> f16 + %67 = llvm.fdiv %50, %24 : f16 + %68 = llvm.fcmp "ogt" %66, %25 : f16 + %69 = llvm.select %68, 
%67, %25 : i1, f16 + %70 = llvm.insertvalue %38, %15[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %71 = llvm.insertvalue %43, %70[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %72 = llvm.insertvalue %23, %71[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %73 = llvm.insertvalue %43, %72[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %74 = llvm.insertvalue %22, %73[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %75 = llvm.insertvalue %43, %74[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %76 = llvm.getelementptr %arg19[%38] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %77 = llvm.getelementptr %76[%64] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %69, %77 : f16, !llvm.ptr + %78 = llvm.add %38, %37 : i64 + llvm.br ^bb1(%78 : i64) + ^bb3: // pred: ^bb1 llvm.return } } diff --git a/compiler/test/E2E/ResNet18/BW/2_linalg_tensor_opt.mlir b/compiler/test/E2E/ResNet18/BW/2_linalg_tensor_opt.mlir index 902811a2f..10309ea20 100644 --- a/compiler/test/E2E/ResNet18/BW/2_linalg_tensor_opt.mlir +++ b/compiler/test/E2E/ResNet18/BW/2_linalg_tensor_opt.mlir @@ -13,10 +13,10 @@ module { return %5 : tensor<1x512x7x7xf16> } func.func private @BatchNormGradOp1(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> - %1 = mhlo.convert %arg2 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<512xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1x512x7x7xf32>) -> (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) + %0 = mhlo.constant dense<0.000000e+00> : tensor<512xf32> + %1 = mhlo.convert %arg0 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> + %2 = mhlo.convert %arg2 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> + %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1x512x7x7xf32>) -> (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) %3 = mhlo.convert %grad_operand : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf16> return %3, %grad_scale, %grad_offset : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> } @@ -37,25 +37,6 @@ module { %2 = mhlo.select %1, %arg1, %0 : tensor<1x512x7x7xi1>, tensor<1x512x7x7xf16> return %2 : tensor<1x512x7x7xf16> } - func.func private @BatchNormGradOp5(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> - %1 = mhlo.convert %arg2 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<512xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) 
{epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1x512x7x7xf32>) -> (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf16> - return %3, %grad_scale, %grad_offset : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - } - func.func private @ConvBackwardDataOp6(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<512x512x3x3xf16>) -> tensor<3x3x512x512xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>} : (tensor<3x3x512x512xf16>) -> tensor<3x3x512x512xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x512x7x7xf16>, tensor<3x3x512x512xf16>) -> tensor<1x512x7x7xf16> - return %2 : tensor<1x512x7x7xf16> - } - func.func private @ConvBackwardFilterOp7(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<1x512x7x7xf16>) -> tensor<512x512x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<3x3x512x512xf16> - %1 = "mhlo.transpose"(%0) {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>, permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>} : (tensor<3x3x512x512xf16>) -> tensor<512x512x3x3xf16> - return %1 : tensor<512x512x3x3xf16> - } func.func private @Unknown8(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<1x512x7x7xf16>, %arg2: tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.000000e+00> : tensor<1x512x7x7xf16> %1 = mhlo.add %arg0, %arg1 : tensor<1x512x7x7xf16> @@ -63,39 +44,6 @@ module { %3 = mhlo.select %2, %1, %0 : tensor<1x512x7x7xi1>, tensor<1x512x7x7xf16> return %3 : tensor<1x512x7x7xf16> } - func.func private @BatchNormGradOp9(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> - %1 = mhlo.convert %arg2 : (tensor<1x512x7x7xf16>) -> 
tensor<1x512x7x7xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<512xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1x512x7x7xf32>) -> (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf16> - return %3, %grad_scale, %grad_offset : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - } - func.func private @ConvBackwardDataOp10(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<512x512x3x3xf16>) -> tensor<3x3x512x512xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>} : (tensor<3x3x512x512xf16>) -> tensor<3x3x512x512xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x512x7x7xf16>, tensor<3x3x512x512xf16>) -> tensor<1x512x7x7xf16> - return %2 : tensor<1x512x7x7xf16> - } - func.func private @ConvBackwardFilterOp11(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<1x512x7x7xf16>) -> tensor<512x512x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<3x3x512x512xf16> - %1 = "mhlo.transpose"(%0) {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>, permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>} : (tensor<3x3x512x512xf16>) -> tensor<512x512x3x3xf16> - return %1 : tensor<512x512x3x3xf16> - } - func.func private @Unknown12(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<1x512x7x7xf16> - %1 = mhlo.compare GT, %arg0, %0 : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xi1> - %2 = mhlo.select %1, %arg1, %0 : tensor<1x512x7x7xi1>, tensor<1x512x7x7xf16> - return %2 : tensor<1x512x7x7xf16> - } - func.func private @BatchNormGradOp13(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 
1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> - %1 = mhlo.convert %arg2 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<512xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1x512x7x7xf32>) -> (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf16> - return %3, %grad_scale, %grad_offset : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - } func.func private @ConvBackwardDataOp14(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512x256x3x3xf16>) -> tensor<1x256x14x14xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<512x256x3x3xf16>) -> tensor<3x3x256x512xf16> %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>} : (tensor<3x3x256x512xf16>) -> tensor<3x3x256x512xf16> @@ -107,14 +55,6 @@ module { %1 = "mhlo.transpose"(%0) {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>, permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>} : (tensor<3x3x256x512xf16>) -> tensor<512x256x3x3xf16> return %1 : tensor<512x256x3x3xf16> } - func.func private @BatchNormGradOp16(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> - %1 = mhlo.convert %arg2 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<512xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1x512x7x7xf32>) -> (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf16> - return %3, %grad_scale, %grad_offset : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - } func.func private @ConvBackwardDataOp17(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512x256x1x1xf16>) -> tensor<1x256x14x14xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<512x256x1x1xf16>) -> tensor<1x1x256x512xf16> %1 = mhlo.convolution(%arg0, %0) dim_numbers = [b, f, 0, 
1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 1], [0, 1]], lhs_dilate = [2, 2], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x512x7x7xf16>, tensor<1x1x256x512xf16>) -> tensor<1x256x14x14xf16> @@ -133,10 +73,10 @@ module { return %3 : tensor<1x256x14x14xf16> } func.func private @BatchNormGradOp20(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %1 = mhlo.convert %arg2 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<256xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<1x256x14x14xf32>) -> (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) + %0 = mhlo.constant dense<0.000000e+00> : tensor<256xf32> + %1 = mhlo.convert %arg0 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> + %2 = mhlo.convert %arg2 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> + %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<1x256x14x14xf32>) -> (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) %3 = mhlo.convert %grad_operand : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf16> return %3, %grad_scale, %grad_offset : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> } @@ -157,65 +97,6 @@ module { %2 = mhlo.select %1, %arg1, %0 : tensor<1x256x14x14xi1>, tensor<1x256x14x14xf16> return %2 : tensor<1x256x14x14xf16> } - func.func private @BatchNormGradOp24(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %1 = mhlo.convert %arg2 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<256xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<1x256x14x14xf32>) -> (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf16> - return %3, %grad_scale, %grad_offset : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - } - func.func private @ConvBackwardDataOp25(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, 
byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<256x256x3x3xf16>) -> tensor<3x3x256x256xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>} : (tensor<3x3x256x256xf16>) -> tensor<3x3x256x256xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<3x3x256x256xf16>) -> tensor<1x256x14x14xf16> - return %2 : tensor<1x256x14x14xf16> - } - func.func private @ConvBackwardFilterOp26(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<1x256x14x14xf16>) -> tensor<256x256x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<3x3x256x256xf16> - %1 = "mhlo.transpose"(%0) {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>, permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>} : (tensor<3x3x256x256xf16>) -> tensor<256x256x3x3xf16> - return %1 : tensor<256x256x3x3xf16> - } - func.func private @Unknown27(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<1x256x14x14xf16>, %arg2: tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<1x256x14x14xf16> - %1 = mhlo.add %arg0, %arg1 : tensor<1x256x14x14xf16> - %2 = mhlo.compare GT, %arg2, %0 : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xi1> - %3 = mhlo.select %2, %1, %0 : tensor<1x256x14x14xi1>, tensor<1x256x14x14xf16> - return %3 : tensor<1x256x14x14xf16> - } - func.func private @BatchNormGradOp28(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %1 = mhlo.convert %arg2 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<256xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<1x256x14x14xf32>) -> (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf16> - return %3, %grad_scale, %grad_offset : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - } - func.func private @ConvBackwardDataOp29(%arg0: tensor<1x256x14x14xf16>, %arg1: 
tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<256x256x3x3xf16>) -> tensor<3x3x256x256xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>} : (tensor<3x3x256x256xf16>) -> tensor<3x3x256x256xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<3x3x256x256xf16>) -> tensor<1x256x14x14xf16> - return %2 : tensor<1x256x14x14xf16> - } - func.func private @ConvBackwardFilterOp30(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<1x256x14x14xf16>) -> tensor<256x256x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<3x3x256x256xf16> - %1 = "mhlo.transpose"(%0) {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>, permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>} : (tensor<3x3x256x256xf16>) -> tensor<256x256x3x3xf16> - return %1 : tensor<256x256x3x3xf16> - } - func.func private @Unknown31(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<1x256x14x14xf16> - %1 = mhlo.compare GT, %arg0, %0 : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xi1> - %2 = mhlo.select %1, %arg1, %0 : tensor<1x256x14x14xi1>, tensor<1x256x14x14xf16> - return %2 : tensor<1x256x14x14xf16> - } - func.func private @BatchNormGradOp32(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %1 = mhlo.convert %arg2 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<256xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<1x256x14x14xf32>) -> (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %3 = mhlo.convert 
%grad_operand : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf16> - return %3, %grad_scale, %grad_offset : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - } func.func private @ConvBackwardDataOp33(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256x128x3x3xf16>) -> tensor<1x128x28x28xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<256x128x3x3xf16>) -> tensor<3x3x128x256xf16> %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>} : (tensor<3x3x128x256xf16>) -> tensor<3x3x128x256xf16> @@ -227,14 +108,6 @@ module { %1 = "mhlo.transpose"(%0) {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>, permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>} : (tensor<3x3x128x256xf16>) -> tensor<256x128x3x3xf16> return %1 : tensor<256x128x3x3xf16> } - func.func private @BatchNormGradOp35(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %1 = mhlo.convert %arg2 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<256xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<1x256x14x14xf32>) -> (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf16> - return %3, %grad_scale, %grad_offset : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - } func.func private @ConvBackwardDataOp36(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256x128x1x1xf16>) -> tensor<1x128x28x28xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<256x128x1x1xf16>) -> tensor<1x1x128x256xf16> %1 = mhlo.convolution(%arg0, %0) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 1], [0, 1]], lhs_dilate = [2, 2], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<1x1x128x256xf16>) -> tensor<1x128x28x28xf16> @@ -253,10 +126,10 @@ module { return %3 : tensor<1x128x28x28xf16> } func.func private @BatchNormGradOp39(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, 
tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %1 = mhlo.convert %arg2 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<128xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x128x28x28xf32>) -> (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) + %0 = mhlo.constant dense<0.000000e+00> : tensor<128xf32> + %1 = mhlo.convert %arg0 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> + %2 = mhlo.convert %arg2 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> + %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x128x28x28xf32>) -> (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) %3 = mhlo.convert %grad_operand : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf16> return %3, %grad_scale, %grad_offset : tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> } @@ -277,65 +150,6 @@ module { %2 = mhlo.select %1, %arg1, %0 : tensor<1x128x28x28xi1>, tensor<1x128x28x28xf16> return %2 : tensor<1x128x28x28xf16> } - func.func private @BatchNormGradOp43(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %1 = mhlo.convert %arg2 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<128xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x128x28x28xf32>) -> (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf16> - return %3, %grad_scale, %grad_offset : tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - } - func.func private @ConvBackwardDataOp44(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<128x128x3x3xf16>) -> tensor<3x3x128x128xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>} : (tensor<3x3x128x128xf16>) -> tensor<3x3x128x128xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], 
[1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<3x3x128x128xf16>) -> tensor<1x128x28x28xf16> - return %2 : tensor<1x128x28x28xf16> - } - func.func private @ConvBackwardFilterOp45(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<1x128x28x28xf16>) -> tensor<128x128x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<3x3x128x128xf16> - %1 = "mhlo.transpose"(%0) {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>, permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>} : (tensor<3x3x128x128xf16>) -> tensor<128x128x3x3xf16> - return %1 : tensor<128x128x3x3xf16> - } - func.func private @Unknown46(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<1x128x28x28xf16>, %arg2: tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<1x128x28x28xf16> - %1 = mhlo.add %arg0, %arg1 : tensor<1x128x28x28xf16> - %2 = mhlo.compare GT, %arg2, %0 : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xi1> - %3 = mhlo.select %2, %1, %0 : tensor<1x128x28x28xi1>, tensor<1x128x28x28xf16> - return %3 : tensor<1x128x28x28xf16> - } - func.func private @BatchNormGradOp47(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %1 = mhlo.convert %arg2 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<128xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x128x28x28xf32>) -> (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf16> - return %3, %grad_scale, %grad_offset : tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - } - func.func private @ConvBackwardDataOp48(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<128x128x3x3xf16>) -> 
tensor<3x3x128x128xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>} : (tensor<3x3x128x128xf16>) -> tensor<3x3x128x128xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<3x3x128x128xf16>) -> tensor<1x128x28x28xf16> - return %2 : tensor<1x128x28x28xf16> - } - func.func private @ConvBackwardFilterOp49(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<1x128x28x28xf16>) -> tensor<128x128x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<3x3x128x128xf16> - %1 = "mhlo.transpose"(%0) {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>, permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>} : (tensor<3x3x128x128xf16>) -> tensor<128x128x3x3xf16> - return %1 : tensor<128x128x3x3xf16> - } - func.func private @Unknown50(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<1x128x28x28xf16> - %1 = mhlo.compare GT, %arg0, %0 : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xi1> - %2 = mhlo.select %1, %arg1, %0 : tensor<1x128x28x28xi1>, tensor<1x128x28x28xf16> - return %2 : tensor<1x128x28x28xf16> - } - func.func private @BatchNormGradOp51(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %1 = mhlo.convert %arg2 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<128xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x128x28x28xf32>) -> (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf16> - return %3, %grad_scale, %grad_offset : tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - } func.func private @ConvBackwardDataOp52(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128x64x3x3xf16>) -> tensor<1x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides 
= dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<128x64x3x3xf16>) -> tensor<3x3x64x128xf16> %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>} : (tensor<3x3x64x128xf16>) -> tensor<3x3x64x128xf16> @@ -347,14 +161,6 @@ module { %1 = "mhlo.transpose"(%0) {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>, permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>} : (tensor<3x3x64x128xf16>) -> tensor<128x64x3x3xf16> return %1 : tensor<128x64x3x3xf16> } - func.func private @BatchNormGradOp54(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %1 = mhlo.convert %arg2 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<128xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x128x28x28xf32>) -> (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf16> - return %3, %grad_scale, %grad_offset : tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - } func.func private @ConvBackwardDataOp55(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128x64x1x1xf16>) -> tensor<1x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<128x64x1x1xf16>) -> tensor<1x1x64x128xf16> %1 = mhlo.convolution(%arg0, %0) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 1], [0, 1]], lhs_dilate = [2, 2], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<1x1x64x128xf16>) -> tensor<1x64x56x56xf16> @@ -373,10 +179,10 @@ module { return %3 : tensor<1x64x56x56xf16> } func.func private @BatchNormGradOp58(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<1x64x56x56xf16>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf32> - %1 = mhlo.convert %arg2 : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<64xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, 
tensor<1x64x56x56xf32>) -> (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) + %0 = mhlo.constant dense<0.000000e+00> : tensor<64xf32> + %1 = mhlo.convert %arg0 : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf32> + %2 = mhlo.convert %arg2 : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf32> + %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<1x64x56x56xf32>) -> (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) %3 = mhlo.convert %grad_operand : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf16> return %3, %grad_scale, %grad_offset : tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> } @@ -397,76 +203,6 @@ module { %2 = mhlo.select %1, %arg1, %0 : tensor<1x64x56x56xi1>, tensor<1x64x56x56xf16> return %2 : tensor<1x64x56x56xf16> } - func.func private @BatchNormGradOp62(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<1x64x56x56xf16>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf32> - %1 = mhlo.convert %arg2 : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<64xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<1x64x56x56xf32>) -> (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf16> - return %3, %grad_scale, %grad_offset : tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> - } - func.func private @ConvBackwardDataOp63(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<64x64x3x3xf16>) -> tensor<3x3x64x64xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>} : (tensor<3x3x64x64xf16>) -> tensor<3x3x64x64xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<3x3x64x64xf16>) -> tensor<1x64x56x56xf16> - return %2 : tensor<1x64x56x56xf16> - } - func.func private @ConvBackwardFilterOp64(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<1x64x56x56xf16>) -> tensor<64x64x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = 
dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<3x3x64x64xf16> - %1 = "mhlo.transpose"(%0) {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>, permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>} : (tensor<3x3x64x64xf16>) -> tensor<64x64x3x3xf16> - return %1 : tensor<64x64x3x3xf16> - } - func.func private @Unknown65(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<1x64x56x56xf16>, %arg2: tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<1x64x56x56xf16> - %1 = mhlo.add %arg0, %arg1 : tensor<1x64x56x56xf16> - %2 = mhlo.compare GT, %arg2, %0 : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xi1> - %3 = mhlo.select %2, %1, %0 : tensor<1x64x56x56xi1>, tensor<1x64x56x56xf16> - return %3 : tensor<1x64x56x56xf16> - } - func.func private @BatchNormGradOp66(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<1x64x56x56xf16>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf32> - %1 = mhlo.convert %arg2 : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<64xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<1x64x56x56xf32>) -> (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf16> - return %3, %grad_scale, %grad_offset : tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> - } - func.func private @ConvBackwardDataOp67(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<64x64x3x3xf16>) -> tensor<3x3x64x64xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>} : (tensor<3x3x64x64xf16>) -> tensor<3x3x64x64xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<3x3x64x64xf16>) -> tensor<1x64x56x56xf16> - return %2 : tensor<1x64x56x56xf16> - } - func.func private @ConvBackwardFilterOp68(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<1x64x56x56xf16>) -> 
tensor<64x64x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<3x3x64x64xf16> - %1 = "mhlo.transpose"(%0) {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>, permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>} : (tensor<3x3x64x64xf16>) -> tensor<64x64x3x3xf16> - return %1 : tensor<64x64x3x3xf16> - } - func.func private @Unknown69(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<1x64x56x56xf16> - %1 = mhlo.compare GT, %arg0, %0 : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xi1> - %2 = mhlo.select %1, %arg1, %0 : tensor<1x64x56x56xi1>, tensor<1x64x56x56xf16> - return %2 : tensor<1x64x56x56xf16> - } - func.func private @BatchNormGradOp70(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<1x64x56x56xf16>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf32> - %1 = mhlo.convert %arg2 : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<64xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<1x64x56x56xf32>) -> (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf16> - return %3, %grad_scale, %grad_offset : tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> - } - func.func private @ConvBackwardDataOp71(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<64x64x3x3xf16>) -> tensor<3x3x64x64xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>} : (tensor<3x3x64x64xf16>) -> tensor<3x3x64x64xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, 
tensor<3x3x64x64xf16>) -> tensor<1x64x56x56xf16> - return %2 : tensor<1x64x56x56xf16> - } - func.func private @ConvBackwardFilterOp72(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<1x64x56x56xf16>) -> tensor<64x64x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<3x3x64x64xf16> - %1 = "mhlo.transpose"(%0) {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>, permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>} : (tensor<3x3x64x64xf16>) -> tensor<64x64x3x3xf16> - return %1 : tensor<64x64x3x3xf16> - } func.func private @Unknown73(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.add %arg0, %arg1 : tensor<1x64x56x56xf16> return %0 : tensor<1x64x56x56xf16> @@ -478,10 +214,10 @@ module { return %2 : tensor<1x64x112x112xf16> } func.func private @BatchNormGradOp75(%arg0: tensor<1x64x112x112xf16>, %arg1: tensor<64xf32>, %arg2: tensor<1x64x112x112xf16>) -> (tensor<1x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x64x112x112xf16>) -> tensor<1x64x112x112xf32> - %1 = mhlo.convert %arg2 : (tensor<1x64x112x112xf16>) -> tensor<1x64x112x112xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<64xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x64x112x112xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<1x64x112x112xf32>) -> (tensor<1x64x112x112xf32>, tensor<64xf32>, tensor<64xf32>) + %0 = mhlo.constant dense<0.000000e+00> : tensor<64xf32> + %1 = mhlo.convert %arg0 : (tensor<1x64x112x112xf16>) -> tensor<1x64x112x112xf32> + %2 = mhlo.convert %arg2 : (tensor<1x64x112x112xf16>) -> tensor<1x64x112x112xf32> + %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x64x112x112xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<1x64x112x112xf32>) -> (tensor<1x64x112x112xf32>, tensor<64xf32>, tensor<64xf32>) %3 = mhlo.convert %grad_operand : (tensor<1x64x112x112xf32>) -> tensor<1x64x112x112xf16> return %3, %grad_scale, %grad_offset : tensor<1x64x112x112xf16>, tensor<64xf32>, tensor<64xf32> } @@ -494,215 +230,170 @@ module { %0 = mhlo.convert %arg0 {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} : (tensor<64x3x7x7xf16>) -> tensor<64x3x7x7xf32> return %0 : tensor<64x3x7x7xf32> } - func.func private @Unknown78(%arg0: tensor<1x1000xf16>) -> tensor<1x1000xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown78(%arg0: tensor<1x1000xf16>) -> tensor<1000xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 : (tensor<1x1000xf16>) -> tensor<1x1000xf32> - return %0 : 
tensor<1x1000xf32> - } - func.func private @Unknown79(%arg0: tensor<1000xf32>) -> tensor<1000xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 : (tensor<1000xf32>) -> tensor<1000xf16> - %1 = mhlo.convert %0 : (tensor<1000xf16>) -> tensor<1000xf32> - return %1 : tensor<1000xf32> + %1 = mhlo.reshape %0 : (tensor<1x1000xf32>) -> tensor<1000xf32> + %2 = mhlo.convert %1 : (tensor<1000xf32>) -> tensor<1000xf16> + %3 = mhlo.convert %2 : (tensor<1000xf16>) -> tensor<1000xf32> + return %3 : tensor<1000xf32> } - func.func private @Unknown80(%arg0: tensor<1000x512xf16>) -> tensor<1000x512xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown79(%arg0: tensor<1000x512xf16>) -> tensor<1000x512xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 : (tensor<1000x512xf16>) -> tensor<1000x512xf32> return %0 : tensor<1000x512xf32> } - func.func private @Unknown81(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - return %0 : tensor<64x64x3x3xf32> - } - func.func private @Unknown82(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown80(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> return %0 : tensor<64x64x3x3xf32> } - func.func private @Unknown83(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - return %0 : tensor<64x64x3x3xf32> - } - func.func private @Unknown84(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - return %0 : tensor<64x64x3x3xf32> - } - func.func private @Unknown85(%arg0: tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown84(%arg0: tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} : (tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> return %0 : tensor<128x64x3x3xf32> } - func.func private @Unknown86(%arg0: tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown85(%arg0: tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> return %0 : tensor<128x128x3x3xf32> } - func.func private @Unknown87(%arg0: tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown86(%arg0: tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} : (tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> return %0 : 
tensor<128x64x1x1xf32> } - func.func private @Unknown88(%arg0: tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> - return %0 : tensor<128x128x3x3xf32> - } - func.func private @Unknown89(%arg0: tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> - return %0 : tensor<128x128x3x3xf32> - } - func.func private @Unknown90(%arg0: tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown89(%arg0: tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} : (tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> return %0 : tensor<256x128x3x3xf32> } - func.func private @Unknown91(%arg0: tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown90(%arg0: tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> return %0 : tensor<256x256x3x3xf32> } - func.func private @Unknown92(%arg0: tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown91(%arg0: tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} : (tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> return %0 : tensor<256x128x1x1xf32> } - func.func private @Unknown93(%arg0: tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> - return %0 : tensor<256x256x3x3xf32> - } - func.func private @Unknown94(%arg0: tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> - return %0 : tensor<256x256x3x3xf32> - } - func.func private @Unknown95(%arg0: tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown94(%arg0: tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} : (tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> return %0 : tensor<512x256x3x3xf32> } - func.func private @Unknown96(%arg0: tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown95(%arg0: tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> return %0 : tensor<512x512x3x3xf32> } - func.func private @Unknown97(%arg0: 
tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown96(%arg0: tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} : (tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> return %0 : tensor<512x256x1x1xf32> } - func.func private @Unknown98(%arg0: tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> - return %0 : tensor<512x512x3x3xf32> - } - func.func private @Unknown99(%arg0: tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> - return %0 : tensor<512x512x3x3xf32> - } func.func @main(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>, %arg2: tensor<64xf32>, %arg3: tensor<64xf32>, %arg4: tensor<64xf32>, %arg5: tensor<64xf32>, %arg6: tensor<64xf32>, %arg7: tensor<64xf32>, %arg8: tensor<64xf32>, %arg9: tensor<64xf32>, %arg10: tensor<128xf32>, %arg11: tensor<128xf32>, %arg12: tensor<128xf32>, %arg13: tensor<128xf32>, %arg14: tensor<128xf32>, %arg15: tensor<128xf32>, %arg16: tensor<128xf32>, %arg17: tensor<128xf32>, %arg18: tensor<128xf32>, %arg19: tensor<128xf32>, %arg20: tensor<256xf32>, %arg21: tensor<256xf32>, %arg22: tensor<256xf32>, %arg23: tensor<256xf32>, %arg24: tensor<256xf32>, %arg25: tensor<256xf32>, %arg26: tensor<256xf32>, %arg27: tensor<256xf32>, %arg28: tensor<256xf32>, %arg29: tensor<256xf32>, %arg30: tensor<512xf32>, %arg31: tensor<512xf32>, %arg32: tensor<512xf32>, %arg33: tensor<512xf32>, %arg34: tensor<512xf32>, %arg35: tensor<512xf32>, %arg36: tensor<512xf32>, %arg37: tensor<512xf32>, %arg38: tensor<512xf32>, %arg39: tensor<512xf32>, %arg40: tensor<64xf32>, %arg41: tensor<64xf32>, %arg42: tensor<64xf32>, %arg43: tensor<64xf32>, %arg44: tensor<64xf32>, %arg45: tensor<64xf32>, %arg46: tensor<64xf32>, %arg47: tensor<64xf32>, %arg48: tensor<64xf32>, %arg49: tensor<64xf32>, %arg50: tensor<128xf32>, %arg51: tensor<128xf32>, %arg52: tensor<128xf32>, %arg53: tensor<128xf32>, %arg54: tensor<128xf32>, %arg55: tensor<128xf32>, %arg56: tensor<128xf32>, %arg57: tensor<128xf32>, %arg58: tensor<128xf32>, %arg59: tensor<128xf32>, %arg60: tensor<256xf32>, %arg61: tensor<256xf32>, %arg62: tensor<256xf32>, %arg63: tensor<256xf32>, %arg64: tensor<256xf32>, %arg65: tensor<256xf32>, %arg66: tensor<256xf32>, %arg67: tensor<256xf32>, %arg68: tensor<256xf32>, %arg69: tensor<256xf32>, %arg70: tensor<512xf32>, %arg71: tensor<512xf32>, %arg72: tensor<512xf32>, %arg73: tensor<512xf32>, %arg74: tensor<512xf32>, %arg75: tensor<512xf32>, %arg76: tensor<512xf32>, %arg77: tensor<512xf32>, %arg78: tensor<512xf32>, %arg79: tensor<512xf32>, %arg80: tensor<64x3x7x7xf16>, %arg81: tensor<1x3x224x224xf16>, %arg82: tensor<1x64x112x112xf16>, %arg83: tensor<1x64x112x112xf16>, %arg84: tensor<1x64x56x56xf16>, %arg85: tensor<64x64x3x3xf16>, %arg86: tensor<1x64x56x56xf16>, %arg87: tensor<1x64x56x56xf16>, %arg88: tensor<64x64x3x3xf16>, %arg89: tensor<1x64x56x56xf16>, %arg90: tensor<1x64x56x56xf16>, %arg91: tensor<64x64x3x3xf16>, %arg92: tensor<1x64x56x56xf16>, %arg93: tensor<1x64x56x56xf16>, %arg94: tensor<64x64x3x3xf16>, %arg95: tensor<1x64x56x56xf16>, %arg96: 
tensor<1x64x56x56xf16>, %arg97: tensor<128x64x3x3xf16>, %arg98: tensor<1x128x28x28xf16>, %arg99: tensor<1x128x28x28xf16>, %arg100: tensor<128x128x3x3xf16>, %arg101: tensor<1x128x28x28xf16>, %arg102: tensor<128x64x1x1xf16>, %arg103: tensor<1x128x28x28xf16>, %arg104: tensor<1x128x28x28xf16>, %arg105: tensor<128x128x3x3xf16>, %arg106: tensor<1x128x28x28xf16>, %arg107: tensor<1x128x28x28xf16>, %arg108: tensor<128x128x3x3xf16>, %arg109: tensor<1x128x28x28xf16>, %arg110: tensor<1x128x28x28xf16>, %arg111: tensor<256x128x3x3xf16>, %arg112: tensor<1x256x14x14xf16>, %arg113: tensor<1x256x14x14xf16>, %arg114: tensor<256x256x3x3xf16>, %arg115: tensor<1x256x14x14xf16>, %arg116: tensor<256x128x1x1xf16>, %arg117: tensor<1x256x14x14xf16>, %arg118: tensor<1x256x14x14xf16>, %arg119: tensor<256x256x3x3xf16>, %arg120: tensor<1x256x14x14xf16>, %arg121: tensor<1x256x14x14xf16>, %arg122: tensor<256x256x3x3xf16>, %arg123: tensor<1x256x14x14xf16>, %arg124: tensor<1x256x14x14xf16>, %arg125: tensor<512x256x3x3xf16>, %arg126: tensor<1x512x7x7xf16>, %arg127: tensor<1x512x7x7xf16>, %arg128: tensor<512x512x3x3xf16>, %arg129: tensor<1x512x7x7xf16>, %arg130: tensor<512x256x1x1xf16>, %arg131: tensor<1x512x7x7xf16>, %arg132: tensor<1x512x7x7xf16>, %arg133: tensor<512x512x3x3xf16>, %arg134: tensor<1x512x7x7xf16>, %arg135: tensor<1x512x7x7xf16>, %arg136: tensor<512x512x3x3xf16>, %arg137: tensor<1x512x7x7xf16>, %arg138: tensor<1x512x7x7xf16>, %arg139: tensor<1x512xf16>, %arg140: tensor<512x1000xf16>, %arg141: tensor<1x1000xf16>) -> (tensor<64xf32>, tensor<64xf32>, tensor<64x3x7x7xf32>, tensor<1000xf32>, tensor<1000x512xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64x64x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x64x3x3xf32>, tensor<128x128x3x3xf32>, tensor<128x64x1x1xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128x128x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x128x3x3xf32>, tensor<256x256x3x3xf32>, tensor<256x128x1x1xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256x256x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x256x3x3xf32>, tensor<512x512x3x3xf32>, tensor<512x256x1x1xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512x512x3x3xf32>) { - %0 = mhlo.constant dense<0.000000e+00> : tensor - %1 = mhlo.constant dense<0.000000e+00> : tensor - %2 = "mhlo.dot_general"(%arg141, %arg140) {dot_dimension_numbers = #mhlo.dot, precision_config = [#mhlo, #mhlo]} : (tensor<1x1000xf16>, tensor<512x1000xf16>) -> tensor<1x512xf16> - %3 = call @Unknown0(%2, %arg138) : (tensor<1x512xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> - %4:3 = call @BatchNormGradOp1(%arg137, %arg39, %3) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %5 = call @ConvBackwardDataOp2(%4#0, %arg136) : (tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> - %6 = call @ConvBackwardFilterOp3(%arg135, %4#0) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<512x512x3x3xf16> - %7 = call 
@Unknown4(%arg135, %5) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> - %8:3 = call @BatchNormGradOp5(%arg134, %arg37, %7) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %9 = call @ConvBackwardDataOp6(%8#0, %arg133) : (tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> - %10 = call @ConvBackwardFilterOp7(%arg132, %8#0) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<512x512x3x3xf16> - %11 = call @Unknown8(%3, %9, %arg132) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> - %12:3 = call @BatchNormGradOp9(%arg129, %arg33, %11) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %13 = call @ConvBackwardDataOp10(%12#0, %arg128) : (tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> - %14 = call @ConvBackwardFilterOp11(%arg127, %12#0) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<512x512x3x3xf16> - %15 = call @Unknown12(%arg127, %13) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> - %16:3 = call @BatchNormGradOp13(%arg126, %arg31, %15) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %17 = call @ConvBackwardDataOp14(%16#0, %arg125) : (tensor<1x512x7x7xf16>, tensor<512x256x3x3xf16>) -> tensor<1x256x14x14xf16> - %18 = call @ConvBackwardFilterOp15(%arg124, %16#0) : (tensor<1x256x14x14xf16>, tensor<1x512x7x7xf16>) -> tensor<512x256x3x3xf16> - %19:3 = call @BatchNormGradOp16(%arg131, %arg35, %11) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %20 = call @ConvBackwardDataOp17(%19#0, %arg130) : (tensor<1x512x7x7xf16>, tensor<512x256x1x1xf16>) -> tensor<1x256x14x14xf16> - %21 = call @ConvBackwardFilterOp18(%arg124, %19#0) : (tensor<1x256x14x14xf16>, tensor<1x512x7x7xf16>) -> tensor<512x256x1x1xf16> - %22 = call @Unknown19(%20, %17, %arg124) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> - %23:3 = call @BatchNormGradOp20(%arg123, %arg29, %22) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %24 = call @ConvBackwardDataOp21(%23#0, %arg122) : (tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> - %25 = call @ConvBackwardFilterOp22(%arg121, %23#0) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<256x256x3x3xf16> - %26 = call @Unknown23(%arg121, %24) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> - %27:3 = call @BatchNormGradOp24(%arg120, %arg27, %26) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %28 = call @ConvBackwardDataOp25(%27#0, %arg119) : (tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> - %29 = call @ConvBackwardFilterOp26(%arg118, %27#0) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<256x256x3x3xf16> - %30 = call @Unknown27(%22, %28, %arg118) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> - %31:3 = call @BatchNormGradOp28(%arg115, %arg23, %30) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<1x256x14x14xf16>) -> 
(tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %32 = call @ConvBackwardDataOp29(%31#0, %arg114) : (tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> - %33 = call @ConvBackwardFilterOp30(%arg113, %31#0) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<256x256x3x3xf16> - %34 = call @Unknown31(%arg113, %32) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> - %35:3 = call @BatchNormGradOp32(%arg112, %arg21, %34) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %36 = call @ConvBackwardDataOp33(%35#0, %arg111) : (tensor<1x256x14x14xf16>, tensor<256x128x3x3xf16>) -> tensor<1x128x28x28xf16> - %37 = call @ConvBackwardFilterOp34(%arg110, %35#0) : (tensor<1x128x28x28xf16>, tensor<1x256x14x14xf16>) -> tensor<256x128x3x3xf16> - %38:3 = call @BatchNormGradOp35(%arg117, %arg25, %30) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %39 = call @ConvBackwardDataOp36(%38#0, %arg116) : (tensor<1x256x14x14xf16>, tensor<256x128x1x1xf16>) -> tensor<1x128x28x28xf16> - %40 = call @ConvBackwardFilterOp37(%arg110, %38#0) : (tensor<1x128x28x28xf16>, tensor<1x256x14x14xf16>) -> tensor<256x128x1x1xf16> - %41 = call @Unknown38(%39, %36, %arg110) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> - %42:3 = call @BatchNormGradOp39(%arg109, %arg19, %41) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %43 = call @ConvBackwardDataOp40(%42#0, %arg108) : (tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> - %44 = call @ConvBackwardFilterOp41(%arg107, %42#0) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<128x128x3x3xf16> - %45 = call @Unknown42(%arg107, %43) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> - %46:3 = call @BatchNormGradOp43(%arg106, %arg17, %45) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %47 = call @ConvBackwardDataOp44(%46#0, %arg105) : (tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> - %48 = call @ConvBackwardFilterOp45(%arg104, %46#0) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<128x128x3x3xf16> - %49 = call @Unknown46(%41, %47, %arg104) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> - %50:3 = call @BatchNormGradOp47(%arg101, %arg13, %49) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %51 = call @ConvBackwardDataOp48(%50#0, %arg100) : (tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> - %52 = call @ConvBackwardFilterOp49(%arg99, %50#0) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<128x128x3x3xf16> - %53 = call @Unknown50(%arg99, %51) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> - %54:3 = call @BatchNormGradOp51(%arg98, %arg11, %53) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %55 = call @ConvBackwardDataOp52(%54#0, %arg97) : (tensor<1x128x28x28xf16>, tensor<128x64x3x3xf16>) -> tensor<1x64x56x56xf16> - 
%56 = call @ConvBackwardFilterOp53(%arg96, %54#0) : (tensor<1x64x56x56xf16>, tensor<1x128x28x28xf16>) -> tensor<128x64x3x3xf16> - %57:3 = call @BatchNormGradOp54(%arg103, %arg15, %49) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %58 = call @ConvBackwardDataOp55(%57#0, %arg102) : (tensor<1x128x28x28xf16>, tensor<128x64x1x1xf16>) -> tensor<1x64x56x56xf16> - %59 = call @ConvBackwardFilterOp56(%arg96, %57#0) : (tensor<1x64x56x56xf16>, tensor<1x128x28x28xf16>) -> tensor<128x64x1x1xf16> - %60 = call @Unknown57(%58, %55, %arg96) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> - %61:3 = call @BatchNormGradOp58(%arg95, %arg9, %60) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<1x64x56x56xf16>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) - %62 = call @ConvBackwardDataOp59(%61#0, %arg94) : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> - %63 = call @ConvBackwardFilterOp60(%arg93, %61#0) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<64x64x3x3xf16> - %64 = call @Unknown61(%arg93, %62) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> - %65:3 = call @BatchNormGradOp62(%arg92, %arg7, %64) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<1x64x56x56xf16>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) - %66 = call @ConvBackwardDataOp63(%65#0, %arg91) : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> - %67 = call @ConvBackwardFilterOp64(%arg90, %65#0) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<64x64x3x3xf16> - %68 = call @Unknown65(%60, %66, %arg90) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> - %69:3 = call @BatchNormGradOp66(%arg89, %arg5, %68) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<1x64x56x56xf16>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) - %70 = call @ConvBackwardDataOp67(%69#0, %arg88) : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> - %71 = call @ConvBackwardFilterOp68(%arg87, %69#0) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<64x64x3x3xf16> - %72 = call @Unknown69(%arg87, %70) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> - %73:3 = call @BatchNormGradOp70(%arg86, %arg3, %72) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<1x64x56x56xf16>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) - %74 = call @ConvBackwardDataOp71(%73#0, %arg85) : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> - %75 = call @ConvBackwardFilterOp72(%arg84, %73#0) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<64x64x3x3xf16> - %76 = call @Unknown73(%68, %74) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> - %77 = "mhlo.select_and_scatter"(%arg83, %76, %1) ({ + %0 = mhlo.constant dense<0.000000e+00> : tensor + %1 = "mhlo.dot_general"(%arg141, %arg140) {dot_dimension_numbers = #mhlo.dot, precision_config = [#mhlo, #mhlo]} : (tensor<1x1000xf16>, tensor<512x1000xf16>) -> tensor<1x512xf16> + %2 = call @Unknown0(%1, %arg138) : (tensor<1x512xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> + %3:3 = call @BatchNormGradOp1(%arg137, %arg39, %2) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %4 = call @ConvBackwardDataOp2(%3#0, 
%arg136) : (tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> + %5 = call @ConvBackwardFilterOp3(%arg135, %3#0) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<512x512x3x3xf16> + %6 = call @Unknown4(%arg135, %4) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> + %7:3 = call @BatchNormGradOp1(%arg134, %arg37, %6) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %8 = call @ConvBackwardDataOp2(%7#0, %arg133) : (tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> + %9 = call @ConvBackwardFilterOp3(%arg132, %7#0) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<512x512x3x3xf16> + %10 = call @Unknown8(%2, %8, %arg132) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> + %11:3 = call @BatchNormGradOp1(%arg129, %arg33, %10) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %12 = call @ConvBackwardDataOp2(%11#0, %arg128) : (tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> + %13 = call @ConvBackwardFilterOp3(%arg127, %11#0) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<512x512x3x3xf16> + %14 = call @Unknown4(%arg127, %12) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> + %15:3 = call @BatchNormGradOp1(%arg126, %arg31, %14) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %16 = call @ConvBackwardDataOp14(%15#0, %arg125) : (tensor<1x512x7x7xf16>, tensor<512x256x3x3xf16>) -> tensor<1x256x14x14xf16> + %17 = call @ConvBackwardFilterOp15(%arg124, %15#0) : (tensor<1x256x14x14xf16>, tensor<1x512x7x7xf16>) -> tensor<512x256x3x3xf16> + %18:3 = call @BatchNormGradOp1(%arg131, %arg35, %10) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %19 = call @ConvBackwardDataOp17(%18#0, %arg130) : (tensor<1x512x7x7xf16>, tensor<512x256x1x1xf16>) -> tensor<1x256x14x14xf16> + %20 = call @ConvBackwardFilterOp18(%arg124, %18#0) : (tensor<1x256x14x14xf16>, tensor<1x512x7x7xf16>) -> tensor<512x256x1x1xf16> + %21 = call @Unknown19(%19, %16, %arg124) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> + %22:3 = call @BatchNormGradOp20(%arg123, %arg29, %21) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %23 = call @ConvBackwardDataOp21(%22#0, %arg122) : (tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> + %24 = call @ConvBackwardFilterOp22(%arg121, %22#0) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<256x256x3x3xf16> + %25 = call @Unknown23(%arg121, %23) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> + %26:3 = call @BatchNormGradOp20(%arg120, %arg27, %25) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %27 = call @ConvBackwardDataOp21(%26#0, %arg119) : (tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> + %28 = call @ConvBackwardFilterOp22(%arg118, %26#0) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<256x256x3x3xf16> + %29 = call @Unknown19(%21, %27, %arg118) : 
(tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> + %30:3 = call @BatchNormGradOp20(%arg115, %arg23, %29) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %31 = call @ConvBackwardDataOp21(%30#0, %arg114) : (tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> + %32 = call @ConvBackwardFilterOp22(%arg113, %30#0) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<256x256x3x3xf16> + %33 = call @Unknown23(%arg113, %31) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> + %34:3 = call @BatchNormGradOp20(%arg112, %arg21, %33) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %35 = call @ConvBackwardDataOp33(%34#0, %arg111) : (tensor<1x256x14x14xf16>, tensor<256x128x3x3xf16>) -> tensor<1x128x28x28xf16> + %36 = call @ConvBackwardFilterOp34(%arg110, %34#0) : (tensor<1x128x28x28xf16>, tensor<1x256x14x14xf16>) -> tensor<256x128x3x3xf16> + %37:3 = call @BatchNormGradOp20(%arg117, %arg25, %29) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %38 = call @ConvBackwardDataOp36(%37#0, %arg116) : (tensor<1x256x14x14xf16>, tensor<256x128x1x1xf16>) -> tensor<1x128x28x28xf16> + %39 = call @ConvBackwardFilterOp37(%arg110, %37#0) : (tensor<1x128x28x28xf16>, tensor<1x256x14x14xf16>) -> tensor<256x128x1x1xf16> + %40 = call @Unknown38(%38, %35, %arg110) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> + %41:3 = call @BatchNormGradOp39(%arg109, %arg19, %40) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %42 = call @ConvBackwardDataOp40(%41#0, %arg108) : (tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> + %43 = call @ConvBackwardFilterOp41(%arg107, %41#0) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<128x128x3x3xf16> + %44 = call @Unknown42(%arg107, %42) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> + %45:3 = call @BatchNormGradOp39(%arg106, %arg17, %44) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %46 = call @ConvBackwardDataOp40(%45#0, %arg105) : (tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> + %47 = call @ConvBackwardFilterOp41(%arg104, %45#0) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<128x128x3x3xf16> + %48 = call @Unknown38(%40, %46, %arg104) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> + %49:3 = call @BatchNormGradOp39(%arg101, %arg13, %48) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %50 = call @ConvBackwardDataOp40(%49#0, %arg100) : (tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> + %51 = call @ConvBackwardFilterOp41(%arg99, %49#0) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<128x128x3x3xf16> + %52 = call @Unknown42(%arg99, %50) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> + %53:3 = call @BatchNormGradOp39(%arg98, %arg11, %52) : (tensor<1x128x28x28xf16>, 
tensor<128xf32>, tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %54 = call @ConvBackwardDataOp52(%53#0, %arg97) : (tensor<1x128x28x28xf16>, tensor<128x64x3x3xf16>) -> tensor<1x64x56x56xf16> + %55 = call @ConvBackwardFilterOp53(%arg96, %53#0) : (tensor<1x64x56x56xf16>, tensor<1x128x28x28xf16>) -> tensor<128x64x3x3xf16> + %56:3 = call @BatchNormGradOp39(%arg103, %arg15, %48) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %57 = call @ConvBackwardDataOp55(%56#0, %arg102) : (tensor<1x128x28x28xf16>, tensor<128x64x1x1xf16>) -> tensor<1x64x56x56xf16> + %58 = call @ConvBackwardFilterOp56(%arg96, %56#0) : (tensor<1x64x56x56xf16>, tensor<1x128x28x28xf16>) -> tensor<128x64x1x1xf16> + %59 = call @Unknown57(%57, %54, %arg96) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> + %60:3 = call @BatchNormGradOp58(%arg95, %arg9, %59) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<1x64x56x56xf16>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) + %61 = call @ConvBackwardDataOp59(%60#0, %arg94) : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> + %62 = call @ConvBackwardFilterOp60(%arg93, %60#0) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<64x64x3x3xf16> + %63 = call @Unknown61(%arg93, %61) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> + %64:3 = call @BatchNormGradOp58(%arg92, %arg7, %63) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<1x64x56x56xf16>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) + %65 = call @ConvBackwardDataOp59(%64#0, %arg91) : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> + %66 = call @ConvBackwardFilterOp60(%arg90, %64#0) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<64x64x3x3xf16> + %67 = call @Unknown57(%59, %65, %arg90) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> + %68:3 = call @BatchNormGradOp58(%arg89, %arg5, %67) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<1x64x56x56xf16>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) + %69 = call @ConvBackwardDataOp59(%68#0, %arg88) : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> + %70 = call @ConvBackwardFilterOp60(%arg87, %68#0) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<64x64x3x3xf16> + %71 = call @Unknown61(%arg87, %69) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> + %72:3 = call @BatchNormGradOp58(%arg86, %arg3, %71) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<1x64x56x56xf16>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) + %73 = call @ConvBackwardDataOp59(%72#0, %arg85) : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> + %74 = call @ConvBackwardFilterOp60(%arg84, %72#0) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<64x64x3x3xf16> + %75 = call @Unknown73(%67, %73) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> + %76 = "mhlo.select_and_scatter"(%arg83, %75, %0) ({ ^bb0(%arg142: tensor, %arg143: tensor): - %107 = mhlo.compare GE, %arg142, %arg143 : (tensor, tensor) -> tensor - mhlo.return %107 : tensor + %104 = mhlo.compare GE, %arg142, %arg143 : (tensor, tensor) -> tensor + mhlo.return %104 : tensor }, { ^bb0(%arg142: tensor, %arg143: tensor): - %107 = mhlo.add %arg142, %arg143 : 
tensor - mhlo.return %107 : tensor + %104 = mhlo.add %arg142, %arg143 : tensor + mhlo.return %104 : tensor }) {padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : (tensor<1x64x112x112xf16>, tensor<1x64x56x56xf16>, tensor) -> tensor<1x64x112x112xf16> - %78 = call @Unknown74(%arg83, %77) : (tensor<1x64x112x112xf16>, tensor<1x64x112x112xf16>) -> tensor<1x64x112x112xf16> - %79:3 = call @BatchNormGradOp75(%arg82, %arg1, %78) : (tensor<1x64x112x112xf16>, tensor<64xf32>, tensor<1x64x112x112xf16>) -> (tensor<1x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) - %80 = call @ConvBackwardFilterOp76(%arg81, %79#0) : (tensor<1x3x224x224xf16>, tensor<1x64x112x112xf16>) -> tensor<64x3x7x7xf16> - %81 = call @Unknown77(%80) : (tensor<64x3x7x7xf16>) -> tensor<64x3x7x7xf32> - %82 = call @Unknown78(%arg141) : (tensor<1x1000xf16>) -> tensor<1x1000xf32> - %83 = mhlo.reduce(%82 init: %0) across dimensions = [0] : (tensor<1x1000xf32>, tensor) -> tensor<1000xf32> - reducer(%arg142: tensor, %arg143: tensor) { - %107 = mhlo.add %arg142, %arg143 : tensor - mhlo.return %107 : tensor - } - %84 = call @Unknown79(%83) : (tensor<1000xf32>) -> tensor<1000xf32> - %85 = mhlo.reshape %arg141 : (tensor<1x1000xf16>) -> tensor<1000x1xf16> - %86 = "mhlo.dot"(%85, %arg139) {precision_config = [#mhlo, #mhlo]} : (tensor<1000x1xf16>, tensor<1x512xf16>) -> tensor<1000x512xf16> - %87 = call @Unknown80(%86) : (tensor<1000x512xf16>) -> tensor<1000x512xf32> - %88 = call @Unknown81(%75) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - %89 = call @Unknown82(%71) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - %90 = call @Unknown83(%67) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - %91 = call @Unknown84(%63) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - %92 = call @Unknown85(%56) : (tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> - %93 = call @Unknown86(%52) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> - %94 = call @Unknown87(%59) : (tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> - %95 = call @Unknown88(%48) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> - %96 = call @Unknown89(%44) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> - %97 = call @Unknown90(%37) : (tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> - %98 = call @Unknown91(%33) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> - %99 = call @Unknown92(%40) : (tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> - %100 = call @Unknown93(%29) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> - %101 = call @Unknown94(%25) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> - %102 = call @Unknown95(%18) : (tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> - %103 = call @Unknown96(%14) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> - %104 = call @Unknown97(%21) : (tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> - %105 = call @Unknown98(%10) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> - %106 = call @Unknown99(%6) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> - return %79#2, %79#1, %81, %84, %87, %73#2, %73#1, %69#2, %69#1, %88, %89, %65#2, %65#1, %61#2, %61#1, %90, %91, %54#2, %54#1, %50#2, %50#1, %92, %93, %94, %57#2, %57#1, %46#2, %46#1, %42#2, %42#1, %95, %96, %35#2, %35#1, %31#2, %31#1, %97, %98, %99, %38#2, %38#1, %27#2, %27#1, %23#2, %23#1, %100, %101, %16#2, %16#1, %12#2, %12#1, %102, %103, %104, %19#2, %19#1, %8#2, %8#1, %4#2, %4#1, %105, %106 : 
tensor<64xf32>, tensor<64xf32>, tensor<64x3x7x7xf32>, tensor<1000xf32>, tensor<1000x512xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64x64x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x64x3x3xf32>, tensor<128x128x3x3xf32>, tensor<128x64x1x1xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128x128x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x128x3x3xf32>, tensor<256x256x3x3xf32>, tensor<256x128x1x1xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256x256x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x256x3x3xf32>, tensor<512x512x3x3xf32>, tensor<512x256x1x1xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512x512x3x3xf32> + %77 = call @Unknown74(%arg83, %76) : (tensor<1x64x112x112xf16>, tensor<1x64x112x112xf16>) -> tensor<1x64x112x112xf16> + %78:3 = call @BatchNormGradOp75(%arg82, %arg1, %77) : (tensor<1x64x112x112xf16>, tensor<64xf32>, tensor<1x64x112x112xf16>) -> (tensor<1x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) + %79 = call @ConvBackwardFilterOp76(%arg81, %78#0) : (tensor<1x3x224x224xf16>, tensor<1x64x112x112xf16>) -> tensor<64x3x7x7xf16> + %80 = call @Unknown77(%79) : (tensor<64x3x7x7xf16>) -> tensor<64x3x7x7xf32> + %81 = call @Unknown78(%arg141) : (tensor<1x1000xf16>) -> tensor<1000xf32> + %82 = mhlo.reshape %arg141 : (tensor<1x1000xf16>) -> tensor<1000x1xf16> + %83 = "mhlo.dot"(%82, %arg139) {precision_config = [#mhlo, #mhlo]} : (tensor<1000x1xf16>, tensor<1x512xf16>) -> tensor<1000x512xf16> + %84 = call @Unknown79(%83) : (tensor<1000x512xf16>) -> tensor<1000x512xf32> + %85 = call @Unknown80(%74) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> + %86 = call @Unknown80(%70) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> + %87 = call @Unknown80(%66) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> + %88 = call @Unknown80(%62) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> + %89 = call @Unknown84(%55) : (tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> + %90 = call @Unknown85(%51) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> + %91 = call @Unknown86(%58) : (tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> + %92 = call @Unknown85(%47) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> + %93 = call @Unknown85(%43) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> + %94 = call @Unknown89(%36) : (tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> + %95 = call @Unknown90(%32) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> + %96 = call @Unknown91(%39) : (tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> + %97 = call @Unknown90(%28) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> + %98 = call @Unknown90(%24) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> + %99 = call @Unknown94(%17) : (tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> + %100 = call @Unknown95(%13) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> + %101 = call @Unknown96(%20) : (tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> + %102 = call @Unknown95(%9) : (tensor<512x512x3x3xf16>) -> 
tensor<512x512x3x3xf32> + %103 = call @Unknown95(%5) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> + return %78#2, %78#1, %80, %81, %84, %72#2, %72#1, %68#2, %68#1, %85, %86, %64#2, %64#1, %60#2, %60#1, %87, %88, %53#2, %53#1, %49#2, %49#1, %89, %90, %91, %56#2, %56#1, %45#2, %45#1, %41#2, %41#1, %92, %93, %34#2, %34#1, %30#2, %30#1, %94, %95, %96, %37#2, %37#1, %26#2, %26#1, %22#2, %22#1, %97, %98, %15#2, %15#1, %11#2, %11#1, %99, %100, %101, %18#2, %18#1, %7#2, %7#1, %3#2, %3#1, %102, %103 : tensor<64xf32>, tensor<64xf32>, tensor<64x3x7x7xf32>, tensor<1000xf32>, tensor<1000x512xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64x64x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x64x3x3xf32>, tensor<128x128x3x3xf32>, tensor<128x64x1x1xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128x128x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x128x3x3xf32>, tensor<256x256x3x3xf32>, tensor<256x128x1x1xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256x256x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x256x3x3xf32>, tensor<512x512x3x3xf32>, tensor<512x256x1x1xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512x512x3x3xf32> } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/BW/3_byre_tensor_opt.mlir b/compiler/test/E2E/ResNet18/BW/3_byre_tensor_opt.mlir index 08a074f1b..17261feae 100644 --- a/compiler/test/E2E/ResNet18/BW/3_byre_tensor_opt.mlir +++ b/compiler/test/E2E/ResNet18/BW/3_byre_tensor_opt.mlir @@ -2,22 +2,36 @@ // CHECK-LABEL: func.func @main -#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> -#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> -#map2 = affine_map<(d0, d1) -> (d0, d1)> -#map3 = affine_map<(d0) -> (d0)> +#map = affine_map<() -> ()> module { func.func private @Unknown0(%arg0: tensor<1x512xf16>, %arg1: tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %cst_0 = arith.constant 4.900000e+01 : f16 %0 = tensor.empty() : tensor<1x512x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg1, %arg0 : tensor<1x512x7x7xf16>, tensor<1x512xf16>) outs(%0 : tensor<1x512x7x7xf16>) { - ^bb0(%in: f16, %in_1: f16, %out: f16): - %2 = arith.divf %in_1, %cst_0 : f16 - %3 = arith.cmpf ogt, %in, %cst : f16 - %4 = arith.select %3, %2, %cst : f16 - linalg.yield %4 : f16 - } -> tensor<1x512x7x7xf16> + %1 = scf.for %arg2 = %c0 to %c512 step %c1 iter_args(%arg3 = %0) -> (tensor<1x512x7x7xf16>) { + %2 = scf.for %arg4 = %c0 to %c7 step %c1 iter_args(%arg5 = %arg3) -> (tensor<1x512x7x7xf16>) { + %3 = scf.for %arg6 = %c0 to %c7 step %c1 iter_args(%arg7 = %arg5) -> (tensor<1x512x7x7xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg2] [1, 1] [1, 1] : tensor<1x512xf16> to tensor + %extracted_slice_1 = 
tensor.extract_slice %arg1[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x512x7x7xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_1 : tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %6 = arith.divf %in, %cst_0 : f16 + %7 = arith.cmpf ogt, %in_2, %cst : f16 + %8 = arith.select %7, %6, %cst : f16 + linalg.yield %8 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg7[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x512x7x7xf16> + scf.yield %inserted_slice : tensor<1x512x7x7xf16> + } + scf.yield %3 : tensor<1x512x7x7xf16> + } + scf.yield %2 : tensor<1x512x7x7xf16> + } return %1 : tensor<1x512x7x7xf16> } func.func private @BatchNormGradOp1(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { @@ -40,85 +54,63 @@ module { return %1 : tensor<512x512x3x3xf16> } func.func private @Unknown4(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x512x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) outs(%0 : tensor<1x512x7x7xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.cmpf ogt, %in, %cst : f16 - %3 = arith.select %2, %in_0, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x512x7x7xf16> + %1 = scf.for %arg2 = %c0 to %c512 step %c1 iter_args(%arg3 = %0) -> (tensor<1x512x7x7xf16>) { + %2 = scf.for %arg4 = %c0 to %c7 step %c1 iter_args(%arg5 = %arg3) -> (tensor<1x512x7x7xf16>) { + %3 = scf.for %arg6 = %c0 to %c7 step %c1 iter_args(%arg7 = %arg5) -> (tensor<1x512x7x7xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x512x7x7xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x512x7x7xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + %6 = arith.cmpf ogt, %in, %cst : f16 + %7 = arith.select %6, %in_1, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg7[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x512x7x7xf16> + scf.yield %inserted_slice : tensor<1x512x7x7xf16> + } + scf.yield %3 : tensor<1x512x7x7xf16> + } + scf.yield %2 : tensor<1x512x7x7xf16> + } return %1 : tensor<1x512x7x7xf16> } - func.func private @BatchNormGradOp5(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : 
tensor<512xf32> - %1 = mhlo.convert %arg0 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> - %2 = mhlo.convert %arg2 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1x512x7x7xf32>) -> (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf16> - return %3, %grad_scale, %grad_offset : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - } - func.func private @ConvBackwardDataOp6(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<512x512x3x3xf16>) -> tensor<3x3x512x512xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>} : (tensor<3x3x512x512xf16>) -> tensor<3x3x512x512xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x512x7x7xf16>, tensor<3x3x512x512xf16>) -> tensor<1x512x7x7xf16> - return %2 : tensor<1x512x7x7xf16> - } - func.func private @ConvBackwardFilterOp7(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<1x512x7x7xf16>) -> tensor<512x512x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<3x3x512x512xf16> - %1 = "mhlo.transpose"(%0) {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>, permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>} : (tensor<3x3x512x512xf16>) -> tensor<512x512x3x3xf16> - return %1 : tensor<512x512x3x3xf16> - } func.func private @Unknown8(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<1x512x7x7xf16>, %arg2: tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x512x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, 
tensor<1x512x7x7xf16>) outs(%0 : tensor<1x512x7x7xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.cmpf ogt, %in, %cst : f16 - %4 = arith.select %3, %2, %cst : f16 - linalg.yield %4 : f16 - } -> tensor<1x512x7x7xf16> - return %1 : tensor<1x512x7x7xf16> - } - func.func private @BatchNormGradOp9(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<512xf32> - %1 = mhlo.convert %arg0 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> - %2 = mhlo.convert %arg2 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1x512x7x7xf32>) -> (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf16> - return %3, %grad_scale, %grad_offset : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - } - func.func private @ConvBackwardDataOp10(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<512x512x3x3xf16>) -> tensor<3x3x512x512xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>} : (tensor<3x3x512x512xf16>) -> tensor<3x3x512x512xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x512x7x7xf16>, tensor<3x3x512x512xf16>) -> tensor<1x512x7x7xf16> - return %2 : tensor<1x512x7x7xf16> - } - func.func private @ConvBackwardFilterOp11(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<1x512x7x7xf16>) -> tensor<512x512x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<3x3x512x512xf16> - %1 = "mhlo.transpose"(%0) {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>, permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>} : (tensor<3x3x512x512xf16>) -> 
tensor<512x512x3x3xf16> - return %1 : tensor<512x512x3x3xf16> - } - func.func private @Unknown12(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x512x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) outs(%0 : tensor<1x512x7x7xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.cmpf ogt, %in, %cst : f16 - %3 = arith.select %2, %in_0, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x512x7x7xf16> + %1 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %0) -> (tensor<1x512x7x7xf16>) { + %2 = scf.for %arg5 = %c0 to %c7 step %c1 iter_args(%arg6 = %arg4) -> (tensor<1x512x7x7xf16>) { + %3 = scf.for %arg7 = %c0 to %c7 step %c1 iter_args(%arg8 = %arg6) -> (tensor<1x512x7x7xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x512x7x7xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x512x7x7xf16> to tensor + %extracted_slice_1 = tensor.extract_slice %arg2[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x512x7x7xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0, %extracted_slice_1 : tensor, tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_2: f16, %in_3: f16, %out: f16): + %6 = arith.addf %in, %in_2 : f16 + %7 = arith.cmpf ogt, %in_3, %cst : f16 + %8 = arith.select %7, %6, %cst : f16 + linalg.yield %8 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg8[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x512x7x7xf16> + scf.yield %inserted_slice : tensor<1x512x7x7xf16> + } + scf.yield %3 : tensor<1x512x7x7xf16> + } + scf.yield %2 : tensor<1x512x7x7xf16> + } return %1 : tensor<1x512x7x7xf16> } - func.func private @BatchNormGradOp13(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<512xf32> - %1 = mhlo.convert %arg0 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> - %2 = mhlo.convert %arg2 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1x512x7x7xf32>) -> (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf16> - return %3, %grad_scale, %grad_offset : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - } func.func private @ConvBackwardDataOp14(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512x256x3x3xf16>) -> tensor<1x256x14x14xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = 
dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<512x256x3x3xf16>) -> tensor<3x3x256x512xf16> %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>} : (tensor<3x3x256x512xf16>) -> tensor<3x3x256x512xf16> @@ -130,14 +122,6 @@ module { %1 = "mhlo.transpose"(%0) {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>, permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>} : (tensor<3x3x256x512xf16>) -> tensor<512x256x3x3xf16> return %1 : tensor<512x256x3x3xf16> } - func.func private @BatchNormGradOp16(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<512xf32> - %1 = mhlo.convert %arg0 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> - %2 = mhlo.convert %arg2 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1x512x7x7xf32>) -> (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf16> - return %3, %grad_scale, %grad_offset : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - } func.func private @ConvBackwardDataOp17(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512x256x1x1xf16>) -> tensor<1x256x14x14xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<512x256x1x1xf16>) -> tensor<1x1x256x512xf16> %1 = mhlo.convolution(%arg0, %0) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 1], [0, 1]], lhs_dilate = [2, 2], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x512x7x7xf16>, tensor<1x1x256x512xf16>) -> tensor<1x256x14x14xf16> @@ -149,15 +133,33 @@ module { return %1 : tensor<512x256x1x1xf16> } func.func private @Unknown19(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<1x256x14x14xf16>, %arg2: tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { + %c14 = arith.constant 14 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x256x14x14xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) outs(%0 : tensor<1x256x14x14xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, 
%in_1 : f16 - %3 = arith.cmpf ogt, %in, %cst : f16 - %4 = arith.select %3, %2, %cst : f16 - linalg.yield %4 : f16 - } -> tensor<1x256x14x14xf16> + %1 = scf.for %arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %0) -> (tensor<1x256x14x14xf16>) { + %2 = scf.for %arg5 = %c0 to %c14 step %c1 iter_args(%arg6 = %arg4) -> (tensor<1x256x14x14xf16>) { + %3 = scf.for %arg7 = %c0 to %c14 step %c1 iter_args(%arg8 = %arg6) -> (tensor<1x256x14x14xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x256x14x14xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x256x14x14xf16> to tensor + %extracted_slice_1 = tensor.extract_slice %arg2[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x256x14x14xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0, %extracted_slice_1 : tensor, tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_2: f16, %in_3: f16, %out: f16): + %6 = arith.addf %in, %in_2 : f16 + %7 = arith.cmpf ogt, %in_3, %cst : f16 + %8 = arith.select %7, %6, %cst : f16 + linalg.yield %8 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg8[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x256x14x14xf16> + scf.yield %inserted_slice : tensor<1x256x14x14xf16> + } + scf.yield %3 : tensor<1x256x14x14xf16> + } + scf.yield %2 : tensor<1x256x14x14xf16> + } return %1 : tensor<1x256x14x14xf16> } func.func private @BatchNormGradOp20(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { @@ -180,85 +182,33 @@ module { return %1 : tensor<256x256x3x3xf16> } func.func private @Unknown23(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { + %c14 = arith.constant 14 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x256x14x14xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) outs(%0 : tensor<1x256x14x14xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.cmpf ogt, %in, %cst : f16 - %3 = arith.select %2, %in_0, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x256x14x14xf16> - return %1 : tensor<1x256x14x14xf16> - } - func.func private @BatchNormGradOp24(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<256xf32> - %1 = mhlo.convert %arg0 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %2 = mhlo.convert %arg2 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x256x14x14xf32>, 
tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<1x256x14x14xf32>) -> (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf16> - return %3, %grad_scale, %grad_offset : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - } - func.func private @ConvBackwardDataOp25(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<256x256x3x3xf16>) -> tensor<3x3x256x256xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>} : (tensor<3x3x256x256xf16>) -> tensor<3x3x256x256xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<3x3x256x256xf16>) -> tensor<1x256x14x14xf16> - return %2 : tensor<1x256x14x14xf16> - } - func.func private @ConvBackwardFilterOp26(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<1x256x14x14xf16>) -> tensor<256x256x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<3x3x256x256xf16> - %1 = "mhlo.transpose"(%0) {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>, permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>} : (tensor<3x3x256x256xf16>) -> tensor<256x256x3x3xf16> - return %1 : tensor<256x256x3x3xf16> - } - func.func private @Unknown27(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<1x256x14x14xf16>, %arg2: tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x256x14x14xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) outs(%0 : tensor<1x256x14x14xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.cmpf ogt, %in, %cst : f16 - %4 = arith.select %3, %2, %cst : f16 - linalg.yield %4 : f16 - } -> tensor<1x256x14x14xf16> - return %1 : tensor<1x256x14x14xf16> - } - func.func private @BatchNormGradOp28(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: 
tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<256xf32> - %1 = mhlo.convert %arg0 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %2 = mhlo.convert %arg2 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<1x256x14x14xf32>) -> (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf16> - return %3, %grad_scale, %grad_offset : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - } - func.func private @ConvBackwardDataOp29(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<256x256x3x3xf16>) -> tensor<3x3x256x256xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>} : (tensor<3x3x256x256xf16>) -> tensor<3x3x256x256xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<3x3x256x256xf16>) -> tensor<1x256x14x14xf16> - return %2 : tensor<1x256x14x14xf16> - } - func.func private @ConvBackwardFilterOp30(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<1x256x14x14xf16>) -> tensor<256x256x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<3x3x256x256xf16> - %1 = "mhlo.transpose"(%0) {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>, permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>} : (tensor<3x3x256x256xf16>) -> tensor<256x256x3x3xf16> - return %1 : tensor<256x256x3x3xf16> - } - func.func private @Unknown31(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x256x14x14xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], 
iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) outs(%0 : tensor<1x256x14x14xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.cmpf ogt, %in, %cst : f16 - %3 = arith.select %2, %in_0, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x256x14x14xf16> + %1 = scf.for %arg2 = %c0 to %c256 step %c1 iter_args(%arg3 = %0) -> (tensor<1x256x14x14xf16>) { + %2 = scf.for %arg4 = %c0 to %c14 step %c1 iter_args(%arg5 = %arg3) -> (tensor<1x256x14x14xf16>) { + %3 = scf.for %arg6 = %c0 to %c14 step %c1 iter_args(%arg7 = %arg5) -> (tensor<1x256x14x14xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x256x14x14xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x256x14x14xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + %6 = arith.cmpf ogt, %in, %cst : f16 + %7 = arith.select %6, %in_1, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg7[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x256x14x14xf16> + scf.yield %inserted_slice : tensor<1x256x14x14xf16> + } + scf.yield %3 : tensor<1x256x14x14xf16> + } + scf.yield %2 : tensor<1x256x14x14xf16> + } return %1 : tensor<1x256x14x14xf16> } - func.func private @BatchNormGradOp32(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<256xf32> - %1 = mhlo.convert %arg0 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %2 = mhlo.convert %arg2 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<1x256x14x14xf32>) -> (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf16> - return %3, %grad_scale, %grad_offset : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - } func.func private @ConvBackwardDataOp33(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256x128x3x3xf16>) -> tensor<1x128x28x28xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<256x128x3x3xf16>) -> tensor<3x3x128x256xf16> %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>} : (tensor<3x3x128x256xf16>) -> tensor<3x3x128x256xf16> @@ -270,14 +220,6 @@ module { %1 = "mhlo.transpose"(%0) {minor_to_major = dense<[0, 1, 3, 
2]> : tensor<4xindex>, permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>} : (tensor<3x3x128x256xf16>) -> tensor<256x128x3x3xf16> return %1 : tensor<256x128x3x3xf16> } - func.func private @BatchNormGradOp35(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<256xf32> - %1 = mhlo.convert %arg0 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %2 = mhlo.convert %arg2 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<1x256x14x14xf32>) -> (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf16> - return %3, %grad_scale, %grad_offset : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - } func.func private @ConvBackwardDataOp36(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256x128x1x1xf16>) -> tensor<1x128x28x28xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<256x128x1x1xf16>) -> tensor<1x1x128x256xf16> %1 = mhlo.convolution(%arg0, %0) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 1], [0, 1]], lhs_dilate = [2, 2], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<1x1x128x256xf16>) -> tensor<1x128x28x28xf16> @@ -289,15 +231,33 @@ module { return %1 : tensor<256x128x1x1xf16> } func.func private @Unknown38(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<1x128x28x28xf16>, %arg2: tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { + %c28 = arith.constant 28 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x128x28x28xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) outs(%0 : tensor<1x128x28x28xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.cmpf ogt, %in, %cst : f16 - %4 = arith.select %3, %2, %cst : f16 - linalg.yield %4 : f16 - } -> tensor<1x128x28x28xf16> + %1 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %0) -> (tensor<1x128x28x28xf16>) { + %2 = scf.for %arg5 = %c0 to %c28 step %c1 iter_args(%arg6 = %arg4) -> (tensor<1x128x28x28xf16>) { + %3 = scf.for %arg7 = %c0 to %c28 step %c1 iter_args(%arg8 = %arg6) -> (tensor<1x128x28x28xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg3, 
%arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x128x28x28xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x128x28x28xf16> to tensor + %extracted_slice_1 = tensor.extract_slice %arg2[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x128x28x28xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0, %extracted_slice_1 : tensor, tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_2: f16, %in_3: f16, %out: f16): + %6 = arith.addf %in, %in_2 : f16 + %7 = arith.cmpf ogt, %in_3, %cst : f16 + %8 = arith.select %7, %6, %cst : f16 + linalg.yield %8 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg8[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x128x28x28xf16> + scf.yield %inserted_slice : tensor<1x128x28x28xf16> + } + scf.yield %3 : tensor<1x128x28x28xf16> + } + scf.yield %2 : tensor<1x128x28x28xf16> + } return %1 : tensor<1x128x28x28xf16> } func.func private @BatchNormGradOp39(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { @@ -320,85 +280,33 @@ module { return %1 : tensor<128x128x3x3xf16> } func.func private @Unknown42(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { + %c28 = arith.constant 28 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x128x28x28xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) outs(%0 : tensor<1x128x28x28xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.cmpf ogt, %in, %cst : f16 - %3 = arith.select %2, %in_0, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x128x28x28xf16> - return %1 : tensor<1x128x28x28xf16> - } - func.func private @BatchNormGradOp43(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<128xf32> - %1 = mhlo.convert %arg0 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %2 = mhlo.convert %arg2 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x128x28x28xf32>) -> (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf16> - return %3, %grad_scale, %grad_offset : tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - } - func.func private @ConvBackwardDataOp44(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> attributes {__byre__batch_group_count = 1 : i64, 
__byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<128x128x3x3xf16>) -> tensor<3x3x128x128xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>} : (tensor<3x3x128x128xf16>) -> tensor<3x3x128x128xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<3x3x128x128xf16>) -> tensor<1x128x28x28xf16> - return %2 : tensor<1x128x28x28xf16> - } - func.func private @ConvBackwardFilterOp45(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<1x128x28x28xf16>) -> tensor<128x128x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<3x3x128x128xf16> - %1 = "mhlo.transpose"(%0) {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>, permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>} : (tensor<3x3x128x128xf16>) -> tensor<128x128x3x3xf16> - return %1 : tensor<128x128x3x3xf16> - } - func.func private @Unknown46(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<1x128x28x28xf16>, %arg2: tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x128x28x28xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) outs(%0 : tensor<1x128x28x28xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.cmpf ogt, %in, %cst : f16 - %4 = arith.select %3, %2, %cst : f16 - linalg.yield %4 : f16 - } -> tensor<1x128x28x28xf16> - return %1 : tensor<1x128x28x28xf16> - } - func.func private @BatchNormGradOp47(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<128xf32> - %1 = mhlo.convert %arg0 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %2 = mhlo.convert %arg2 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, 
%2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x128x28x28xf32>) -> (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf16> - return %3, %grad_scale, %grad_offset : tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - } - func.func private @ConvBackwardDataOp48(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<128x128x3x3xf16>) -> tensor<3x3x128x128xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>} : (tensor<3x3x128x128xf16>) -> tensor<3x3x128x128xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<3x3x128x128xf16>) -> tensor<1x128x28x28xf16> - return %2 : tensor<1x128x28x28xf16> - } - func.func private @ConvBackwardFilterOp49(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<1x128x28x28xf16>) -> tensor<128x128x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<3x3x128x128xf16> - %1 = "mhlo.transpose"(%0) {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>, permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>} : (tensor<3x3x128x128xf16>) -> tensor<128x128x3x3xf16> - return %1 : tensor<128x128x3x3xf16> - } - func.func private @Unknown50(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x128x28x28xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) outs(%0 : tensor<1x128x28x28xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.cmpf ogt, %in, %cst : f16 - %3 = arith.select %2, %in_0, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x128x28x28xf16> + %1 = scf.for %arg2 = %c0 to %c128 step %c1 iter_args(%arg3 = %0) -> (tensor<1x128x28x28xf16>) { + %2 = scf.for %arg4 = %c0 to %c28 step %c1 iter_args(%arg5 = %arg3) -> 
(tensor<1x128x28x28xf16>) { + %3 = scf.for %arg6 = %c0 to %c28 step %c1 iter_args(%arg7 = %arg5) -> (tensor<1x128x28x28xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x128x28x28xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x128x28x28xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + %6 = arith.cmpf ogt, %in, %cst : f16 + %7 = arith.select %6, %in_1, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg7[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x128x28x28xf16> + scf.yield %inserted_slice : tensor<1x128x28x28xf16> + } + scf.yield %3 : tensor<1x128x28x28xf16> + } + scf.yield %2 : tensor<1x128x28x28xf16> + } return %1 : tensor<1x128x28x28xf16> } - func.func private @BatchNormGradOp51(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<128xf32> - %1 = mhlo.convert %arg0 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %2 = mhlo.convert %arg2 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x128x28x28xf32>) -> (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf16> - return %3, %grad_scale, %grad_offset : tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - } func.func private @ConvBackwardDataOp52(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128x64x3x3xf16>) -> tensor<1x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<128x64x3x3xf16>) -> tensor<3x3x64x128xf16> %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>} : (tensor<3x3x64x128xf16>) -> tensor<3x3x64x128xf16> @@ -410,14 +318,6 @@ module { %1 = "mhlo.transpose"(%0) {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>, permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>} : (tensor<3x3x64x128xf16>) -> tensor<128x64x3x3xf16> return %1 : tensor<128x64x3x3xf16> } - func.func private @BatchNormGradOp54(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : 
tensor<128xf32> - %1 = mhlo.convert %arg0 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %2 = mhlo.convert %arg2 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<1x128x28x28xf32>) -> (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf16> - return %3, %grad_scale, %grad_offset : tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - } func.func private @ConvBackwardDataOp55(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128x64x1x1xf16>) -> tensor<1x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<128x64x1x1xf16>) -> tensor<1x1x64x128xf16> %1 = mhlo.convolution(%arg0, %0) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 1], [0, 1]], lhs_dilate = [2, 2], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<1x1x64x128xf16>) -> tensor<1x64x56x56xf16> @@ -429,15 +329,33 @@ module { return %1 : tensor<128x64x1x1xf16> } func.func private @Unknown57(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<1x64x56x56xf16>, %arg2: tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + %c56 = arith.constant 56 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) outs(%0 : tensor<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.cmpf ogt, %in, %cst : f16 - %4 = arith.select %3, %2, %cst : f16 - linalg.yield %4 : f16 - } -> tensor<1x64x56x56xf16> + %1 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %0) -> (tensor<1x64x56x56xf16>) { + %2 = scf.for %arg5 = %c0 to %c56 step %c1 iter_args(%arg6 = %arg4) -> (tensor<1x64x56x56xf16>) { + %3 = scf.for %arg7 = %c0 to %c56 step %c1 iter_args(%arg8 = %arg6) -> (tensor<1x64x56x56xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x56x56xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x56x56xf16> to tensor + %extracted_slice_1 = tensor.extract_slice %arg2[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x56x56xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0, %extracted_slice_1 : tensor, tensor, tensor) 
outs(%4 : tensor) { + ^bb0(%in: f16, %in_2: f16, %in_3: f16, %out: f16): + %6 = arith.addf %in, %in_2 : f16 + %7 = arith.cmpf ogt, %in_3, %cst : f16 + %8 = arith.select %7, %6, %cst : f16 + linalg.yield %8 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg8[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x64x56x56xf16> + scf.yield %inserted_slice : tensor<1x64x56x56xf16> + } + scf.yield %3 : tensor<1x64x56x56xf16> + } + scf.yield %2 : tensor<1x64x56x56xf16> + } return %1 : tensor<1x64x56x56xf16> } func.func private @BatchNormGradOp58(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<1x64x56x56xf16>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { @@ -460,114 +378,85 @@ module { return %1 : tensor<64x64x3x3xf16> } func.func private @Unknown61(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + %c56 = arith.constant 56 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) outs(%0 : tensor<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.cmpf ogt, %in, %cst : f16 - %3 = arith.select %2, %in_0, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x64x56x56xf16> - return %1 : tensor<1x64x56x56xf16> - } - func.func private @BatchNormGradOp62(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<1x64x56x56xf16>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<64xf32> - %1 = mhlo.convert %arg0 : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf32> - %2 = mhlo.convert %arg2 : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<1x64x56x56xf32>) -> (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf16> - return %3, %grad_scale, %grad_offset : tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> - } - func.func private @ConvBackwardDataOp63(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<64x64x3x3xf16>) -> tensor<3x3x64x64xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>} : 
(tensor<3x3x64x64xf16>) -> tensor<3x3x64x64xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<3x3x64x64xf16>) -> tensor<1x64x56x56xf16> - return %2 : tensor<1x64x56x56xf16> - } - func.func private @ConvBackwardFilterOp64(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<1x64x56x56xf16>) -> tensor<64x64x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<3x3x64x64xf16> - %1 = "mhlo.transpose"(%0) {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>, permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>} : (tensor<3x3x64x64xf16>) -> tensor<64x64x3x3xf16> - return %1 : tensor<64x64x3x3xf16> - } - func.func private @Unknown65(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<1x64x56x56xf16>, %arg2: tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) outs(%0 : tensor<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.cmpf ogt, %in, %cst : f16 - %4 = arith.select %3, %2, %cst : f16 - linalg.yield %4 : f16 - } -> tensor<1x64x56x56xf16> - return %1 : tensor<1x64x56x56xf16> - } - func.func private @BatchNormGradOp66(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<1x64x56x56xf16>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<64xf32> - %1 = mhlo.convert %arg0 : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf32> - %2 = mhlo.convert %arg2 : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<1x64x56x56xf32>) -> (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf16> - return %3, %grad_scale, %grad_offset : tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> - } - func.func private @ConvBackwardDataOp67(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, 
__byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<64x64x3x3xf16>) -> tensor<3x3x64x64xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>} : (tensor<3x3x64x64xf16>) -> tensor<3x3x64x64xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<3x3x64x64xf16>) -> tensor<1x64x56x56xf16> - return %2 : tensor<1x64x56x56xf16> - } - func.func private @ConvBackwardFilterOp68(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<1x64x56x56xf16>) -> tensor<64x64x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<3x3x64x64xf16> - %1 = "mhlo.transpose"(%0) {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>, permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>} : (tensor<3x3x64x64xf16>) -> tensor<64x64x3x3xf16> - return %1 : tensor<64x64x3x3xf16> - } - func.func private @Unknown69(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) outs(%0 : tensor<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.cmpf ogt, %in, %cst : f16 - %3 = arith.select %2, %in_0, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x64x56x56xf16> + %1 = scf.for %arg2 = %c0 to %c64 step %c1 iter_args(%arg3 = %0) -> (tensor<1x64x56x56xf16>) { + %2 = scf.for %arg4 = %c0 to %c56 step %c1 iter_args(%arg5 = %arg3) -> (tensor<1x64x56x56xf16>) { + %3 = scf.for %arg6 = %c0 to %c56 step %c1 iter_args(%arg7 = %arg5) -> (tensor<1x64x56x56xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x56x56xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x56x56xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + %6 = arith.cmpf ogt, %in, %cst : f16 + %7 = arith.select %6, 
%in_1, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg7[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x64x56x56xf16> + scf.yield %inserted_slice : tensor<1x64x56x56xf16> + } + scf.yield %3 : tensor<1x64x56x56xf16> + } + scf.yield %2 : tensor<1x64x56x56xf16> + } return %1 : tensor<1x64x56x56xf16> } - func.func private @BatchNormGradOp70(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<1x64x56x56xf16>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<64xf32> - %1 = mhlo.convert %arg0 : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf32> - %2 = mhlo.convert %arg2 : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<1x64x56x56xf32>) -> (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) - %3 = mhlo.convert %grad_operand : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf16> - return %3, %grad_scale, %grad_offset : tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> - } - func.func private @ConvBackwardDataOp71(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>, permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>} : (tensor<64x64x3x3xf16>) -> tensor<3x3x64x64xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, minor_to_major = dense<[1, 0, 2, 3]> : tensor<4xindex>} : (tensor<3x3x64x64xf16>) -> tensor<3x3x64x64xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<3x3x64x64xf16>) -> tensor<1x64x56x56xf16> - return %2 : tensor<1x64x56x56xf16> - } - func.func private @ConvBackwardFilterOp72(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<1x64x56x56xf16>) -> tensor<64x64x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<3x3x64x64xf16> - %1 = "mhlo.transpose"(%0) {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>, permutation = dense<[3, 2, 0, 1]> : 
tensor<4xi64>} : (tensor<3x3x64x64xf16>) -> tensor<64x64x3x3xf16> - return %1 : tensor<64x64x3x3xf16> - } func.func private @Unknown73(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + %c56 = arith.constant 56 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<1x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) outs(%0 : tensor<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.addf %in, %in_0 : f16 - linalg.yield %2 : f16 - } -> tensor<1x64x56x56xf16> + %1 = scf.for %arg2 = %c0 to %c64 step %c1 iter_args(%arg3 = %0) -> (tensor<1x64x56x56xf16>) { + %2 = scf.for %arg4 = %c0 to %c56 step %c1 iter_args(%arg5 = %arg3) -> (tensor<1x64x56x56xf16>) { + %3 = scf.for %arg6 = %c0 to %c56 step %c1 iter_args(%arg7 = %arg5) -> (tensor<1x64x56x56xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x56x56xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x56x56xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + %6 = arith.addf %in, %in_1 : f16 + linalg.yield %6 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg7[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x64x56x56xf16> + scf.yield %inserted_slice : tensor<1x64x56x56xf16> + } + scf.yield %3 : tensor<1x64x56x56xf16> + } + scf.yield %2 : tensor<1x64x56x56xf16> + } return %1 : tensor<1x64x56x56xf16> } func.func private @Unknown74(%arg0: tensor<1x64x112x112xf16>, %arg1: tensor<1x64x112x112xf16>) -> tensor<1x64x112x112xf16> attributes {__byteir_elementwise_fusion__} { + %c112 = arith.constant 112 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x64x112x112xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x64x112x112xf16>, tensor<1x64x112x112xf16>) outs(%0 : tensor<1x64x112x112xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.cmpf ogt, %in, %cst : f16 - %3 = arith.select %2, %in_0, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x64x112x112xf16> + %1 = scf.for %arg2 = %c0 to %c64 step %c1 iter_args(%arg3 = %0) -> (tensor<1x64x112x112xf16>) { + %2 = scf.for %arg4 = %c0 to %c112 step %c1 iter_args(%arg5 = %arg3) -> (tensor<1x64x112x112xf16>) { + %3 = scf.for %arg6 = %c0 to %c112 step %c1 iter_args(%arg7 = %arg5) -> (tensor<1x64x112x112xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x112x112xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x112x112xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) 
outs(%4 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + %6 = arith.cmpf ogt, %in, %cst : f16 + %7 = arith.select %6, %in_1, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg7[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x64x112x112xf16> + scf.yield %inserted_slice : tensor<1x64x112x112xf16> + } + scf.yield %3 : tensor<1x64x112x112xf16> + } + scf.yield %2 : tensor<1x64x112x112xf16> + } return %1 : tensor<1x64x112x112xf16> } func.func private @BatchNormGradOp75(%arg0: tensor<1x64x112x112xf16>, %arg1: tensor<64xf32>, %arg2: tensor<1x64x112x112xf16>) -> (tensor<1x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { @@ -584,333 +473,455 @@ module { return %1 : tensor<64x3x7x7xf16> } func.func private @Unknown77(%arg0: tensor<64x3x7x7xf16>) -> tensor<64x3x7x7xf32> attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<64x3x7x7xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x3x7x7xf16>) outs(%0 : tensor<64x3x7x7xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<64x3x7x7xf32> + %1 = scf.for %arg1 = %c0 to %c64 step %c1 iter_args(%arg2 = %0) -> (tensor<64x3x7x7xf32>) { + %2 = scf.for %arg3 = %c0 to %c3 step %c1 iter_args(%arg4 = %arg2) -> (tensor<64x3x7x7xf32>) { + %3 = scf.for %arg5 = %c0 to %c7 step %c1 iter_args(%arg6 = %arg4) -> (tensor<64x3x7x7xf32>) { + %4 = scf.for %arg7 = %c0 to %c7 step %c1 iter_args(%arg8 = %arg6) -> (tensor<64x3x7x7xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<64x3x7x7xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<64x3x7x7xf32> + scf.yield %inserted_slice : tensor<64x3x7x7xf32> + } + scf.yield %4 : tensor<64x3x7x7xf32> + } + scf.yield %3 : tensor<64x3x7x7xf32> + } + scf.yield %2 : tensor<64x3x7x7xf32> + } return %1 : tensor<64x3x7x7xf32> } - func.func private @Unknown78(%arg0: tensor<1x1000xf16>) -> tensor<1x1000xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown78(%arg0: tensor<1x1000xf16>) -> tensor<1000xf32> attributes {__byteir_elementwise_fusion__} { + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<1x1000xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x1000xf16>) outs(%0 : tensor<1x1000xf32>) { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<1x1000xf32> - return %1 : tensor<1x1000xf32> - } - func.func private @Unknown79(%arg0: tensor<1000xf32>) -> tensor<1000xf32> attributes 
{__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<1000xf32> - %1 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel"]} ins(%arg0 : tensor<1000xf32>) outs(%0 : tensor<1000xf32>) { - ^bb0(%in: f32, %out: f32): - %2 = arith.truncf %in : f32 to f16 - %3 = arith.extf %2 : f16 to f32 - linalg.yield %3 : f32 - } -> tensor<1000xf32> - return %1 : tensor<1000xf32> - } - func.func private @Unknown80(%arg0: tensor<1000x512xf16>) -> tensor<1000x512xf32> attributes {__byteir_elementwise_fusion__} { + %1 = scf.for %arg1 = %c0 to %c1000 step %c1 iter_args(%arg2 = %0) -> (tensor<1x1000xf32>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg1] [1, 1] [1, 1] : tensor<1x1000xf16> to tensor + %2 = tensor.empty() : tensor + %3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%2 : tensor) { + ^bb0(%in: f16, %out: f32): + %4 = arith.extf %in : f16 to f32 + %5 = arith.truncf %4 : f32 to f16 + %6 = arith.extf %5 : f16 to f32 + linalg.yield %6 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %3 into %arg2[0, %arg1] [1, 1] [1, 1] : tensor into tensor<1x1000xf32> + scf.yield %inserted_slice : tensor<1x1000xf32> + } + %collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x1000xf32> into tensor<1000xf32> + return %collapsed : tensor<1000xf32> + } + func.func private @Unknown79(%arg0: tensor<1000x512xf16>) -> tensor<1000x512xf32> attributes {__byteir_elementwise_fusion__} { + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<1000x512xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1000x512xf16>) outs(%0 : tensor<1000x512xf32>) { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<1000x512xf32> + %1 = scf.for %arg1 = %c0 to %c1000 step %c1 iter_args(%arg2 = %0) -> (tensor<1000x512xf32>) { + %2 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %arg2) -> (tensor<1000x512xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3] [1, 1] [1, 1] : tensor<1000x512xf16> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f16, %out: f32): + %5 = arith.extf %in : f16 to f32 + linalg.yield %5 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3] [1, 1] [1, 1] : tensor into tensor<1000x512xf32> + scf.yield %inserted_slice : tensor<1000x512xf32> + } + scf.yield %2 : tensor<1000x512xf32> + } return %1 : tensor<1000x512xf32> } - func.func private @Unknown81(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<64x64x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf16>) outs(%0 : tensor<64x64x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<64x64x3x3xf32> - return %1 : tensor<64x64x3x3xf32> - } - func.func private @Unknown82(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<64x64x3x3xf32> - %1 = linalg.generic 
{indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf16>) outs(%0 : tensor<64x64x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<64x64x3x3xf32> - return %1 : tensor<64x64x3x3xf32> - } - func.func private @Unknown83(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown80(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<64x64x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf16>) outs(%0 : tensor<64x64x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<64x64x3x3xf32> - return %1 : tensor<64x64x3x3xf32> - } - func.func private @Unknown84(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<64x64x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf16>) outs(%0 : tensor<64x64x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<64x64x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c64 step %c1 iter_args(%arg2 = %0) -> (tensor<64x64x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %arg2) -> (tensor<64x64x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<64x64x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<64x64x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<64x64x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<64x64x3x3xf32> + scf.yield %inserted_slice : tensor<64x64x3x3xf32> + } + scf.yield %4 : tensor<64x64x3x3xf32> + } + scf.yield %3 : tensor<64x64x3x3xf32> + } + scf.yield %2 : tensor<64x64x3x3xf32> + } return %1 : tensor<64x64x3x3xf32> } - func.func private @Unknown85(%arg0: tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown84(%arg0: tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<128x64x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : 
tensor<128x64x3x3xf16>) outs(%0 : tensor<128x64x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<128x64x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<128x64x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %arg2) -> (tensor<128x64x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<128x64x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<128x64x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<128x64x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<128x64x3x3xf32> + scf.yield %inserted_slice : tensor<128x64x3x3xf32> + } + scf.yield %4 : tensor<128x64x3x3xf32> + } + scf.yield %3 : tensor<128x64x3x3xf32> + } + scf.yield %2 : tensor<128x64x3x3xf32> + } return %1 : tensor<128x64x3x3xf32> } - func.func private @Unknown86(%arg0: tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown85(%arg0: tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<128x128x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x128x3x3xf16>) outs(%0 : tensor<128x128x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<128x128x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<128x128x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %arg2) -> (tensor<128x128x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<128x128x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<128x128x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<128x128x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<128x128x3x3xf32> + scf.yield %inserted_slice : tensor<128x128x3x3xf32> + } + scf.yield %4 : tensor<128x128x3x3xf32> + } + scf.yield %3 : tensor<128x128x3x3xf32> + } + scf.yield %2 : tensor<128x128x3x3xf32> + } return %1 : tensor<128x128x3x3xf32> } - func.func private @Unknown87(%arg0: tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private 
@Unknown86(%arg0: tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<128x64x1x1xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x64x1x1xf16>) outs(%0 : tensor<128x64x1x1xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<128x64x1x1xf32> + %1 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<128x64x1x1xf32>) { + %2 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %arg2) -> (tensor<128x64x1x1xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<128x64x1x1xf16> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f16, %out: f32): + %5 = arith.extf %in : f16 to f32 + linalg.yield %5 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<128x64x1x1xf32> + scf.yield %inserted_slice : tensor<128x64x1x1xf32> + } + scf.yield %2 : tensor<128x64x1x1xf32> + } return %1 : tensor<128x64x1x1xf32> } - func.func private @Unknown88(%arg0: tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<128x128x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x128x3x3xf16>) outs(%0 : tensor<128x128x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<128x128x3x3xf32> - return %1 : tensor<128x128x3x3xf32> - } - func.func private @Unknown89(%arg0: tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<128x128x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x128x3x3xf16>) outs(%0 : tensor<128x128x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<128x128x3x3xf32> - return %1 : tensor<128x128x3x3xf32> - } - func.func private @Unknown90(%arg0: tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown89(%arg0: tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<256x128x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x128x3x3xf16>) outs(%0 : tensor<256x128x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - 
linalg.yield %2 : f32 - } -> tensor<256x128x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 iter_args(%arg2 = %0) -> (tensor<256x128x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %arg2) -> (tensor<256x128x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<256x128x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<256x128x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<256x128x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<256x128x3x3xf32> + scf.yield %inserted_slice : tensor<256x128x3x3xf32> + } + scf.yield %4 : tensor<256x128x3x3xf32> + } + scf.yield %3 : tensor<256x128x3x3xf32> + } + scf.yield %2 : tensor<256x128x3x3xf32> + } return %1 : tensor<256x128x3x3xf32> } - func.func private @Unknown91(%arg0: tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown90(%arg0: tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<256x256x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x256x3x3xf16>) outs(%0 : tensor<256x256x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<256x256x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 iter_args(%arg2 = %0) -> (tensor<256x256x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %arg2) -> (tensor<256x256x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<256x256x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<256x256x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<256x256x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<256x256x3x3xf32> + scf.yield %inserted_slice : tensor<256x256x3x3xf32> + } + scf.yield %4 : tensor<256x256x3x3xf32> + } + scf.yield %3 : tensor<256x256x3x3xf32> + } + scf.yield %2 : tensor<256x256x3x3xf32> + } return %1 : tensor<256x256x3x3xf32> } - func.func private @Unknown92(%arg0: tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown91(%arg0: tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 
: index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<256x128x1x1xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x128x1x1xf16>) outs(%0 : tensor<256x128x1x1xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<256x128x1x1xf32> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 iter_args(%arg2 = %0) -> (tensor<256x128x1x1xf32>) { + %2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %arg2) -> (tensor<256x128x1x1xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<256x128x1x1xf16> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f16, %out: f32): + %5 = arith.extf %in : f16 to f32 + linalg.yield %5 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<256x128x1x1xf32> + scf.yield %inserted_slice : tensor<256x128x1x1xf32> + } + scf.yield %2 : tensor<256x128x1x1xf32> + } return %1 : tensor<256x128x1x1xf32> } - func.func private @Unknown93(%arg0: tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<256x256x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x256x3x3xf16>) outs(%0 : tensor<256x256x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<256x256x3x3xf32> - return %1 : tensor<256x256x3x3xf32> - } - func.func private @Unknown94(%arg0: tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<256x256x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x256x3x3xf16>) outs(%0 : tensor<256x256x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<256x256x3x3xf32> - return %1 : tensor<256x256x3x3xf32> - } - func.func private @Unknown95(%arg0: tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown94(%arg0: tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<512x256x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x256x3x3xf16>) outs(%0 : tensor<512x256x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<512x256x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<512x256x3x3xf32>) { + %2 = scf.for 
%arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %arg2) -> (tensor<512x256x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<512x256x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<512x256x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<512x256x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<512x256x3x3xf32> + scf.yield %inserted_slice : tensor<512x256x3x3xf32> + } + scf.yield %4 : tensor<512x256x3x3xf32> + } + scf.yield %3 : tensor<512x256x3x3xf32> + } + scf.yield %2 : tensor<512x256x3x3xf32> + } return %1 : tensor<512x256x3x3xf32> } - func.func private @Unknown96(%arg0: tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown95(%arg0: tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<512x512x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x512x3x3xf16>) outs(%0 : tensor<512x512x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<512x512x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<512x512x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %arg2) -> (tensor<512x512x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<512x512x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<512x512x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<512x512x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<512x512x3x3xf32> + scf.yield %inserted_slice : tensor<512x512x3x3xf32> + } + scf.yield %4 : tensor<512x512x3x3xf32> + } + scf.yield %3 : tensor<512x512x3x3xf32> + } + scf.yield %2 : tensor<512x512x3x3xf32> + } return %1 : tensor<512x512x3x3xf32> } - func.func private @Unknown97(%arg0: tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown96(%arg0: tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<512x256x1x1xf32> - %1 = linalg.generic {indexing_maps = 
[#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x256x1x1xf16>) outs(%0 : tensor<512x256x1x1xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<512x256x1x1xf32> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<512x256x1x1xf32>) { + %2 = scf.for %arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %arg2) -> (tensor<512x256x1x1xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<512x256x1x1xf16> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f16, %out: f32): + %5 = arith.extf %in : f16 to f32 + linalg.yield %5 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<512x256x1x1xf32> + scf.yield %inserted_slice : tensor<512x256x1x1xf32> + } + scf.yield %2 : tensor<512x256x1x1xf32> + } return %1 : tensor<512x256x1x1xf32> } - func.func private @Unknown98(%arg0: tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<512x512x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x512x3x3xf16>) outs(%0 : tensor<512x512x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<512x512x3x3xf32> - return %1 : tensor<512x512x3x3xf32> - } - func.func private @Unknown99(%arg0: tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<512x512x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x512x3x3xf16>) outs(%0 : tensor<512x512x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<512x512x3x3xf32> - return %1 : tensor<512x512x3x3xf32> - } func.func @main(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>, %arg2: tensor<64xf32>, %arg3: tensor<64xf32>, %arg4: tensor<64xf32>, %arg5: tensor<64xf32>, %arg6: tensor<64xf32>, %arg7: tensor<64xf32>, %arg8: tensor<64xf32>, %arg9: tensor<64xf32>, %arg10: tensor<128xf32>, %arg11: tensor<128xf32>, %arg12: tensor<128xf32>, %arg13: tensor<128xf32>, %arg14: tensor<128xf32>, %arg15: tensor<128xf32>, %arg16: tensor<128xf32>, %arg17: tensor<128xf32>, %arg18: tensor<128xf32>, %arg19: tensor<128xf32>, %arg20: tensor<256xf32>, %arg21: tensor<256xf32>, %arg22: tensor<256xf32>, %arg23: tensor<256xf32>, %arg24: tensor<256xf32>, %arg25: tensor<256xf32>, %arg26: tensor<256xf32>, %arg27: tensor<256xf32>, %arg28: tensor<256xf32>, %arg29: tensor<256xf32>, %arg30: tensor<512xf32>, %arg31: tensor<512xf32>, %arg32: tensor<512xf32>, %arg33: tensor<512xf32>, %arg34: tensor<512xf32>, %arg35: tensor<512xf32>, %arg36: tensor<512xf32>, %arg37: tensor<512xf32>, %arg38: tensor<512xf32>, %arg39: tensor<512xf32>, %arg40: tensor<64xf32>, %arg41: tensor<64xf32>, %arg42: tensor<64xf32>, %arg43: tensor<64xf32>, %arg44: tensor<64xf32>, %arg45: tensor<64xf32>, 
%arg46: tensor<64xf32>, %arg47: tensor<64xf32>, %arg48: tensor<64xf32>, %arg49: tensor<64xf32>, %arg50: tensor<128xf32>, %arg51: tensor<128xf32>, %arg52: tensor<128xf32>, %arg53: tensor<128xf32>, %arg54: tensor<128xf32>, %arg55: tensor<128xf32>, %arg56: tensor<128xf32>, %arg57: tensor<128xf32>, %arg58: tensor<128xf32>, %arg59: tensor<128xf32>, %arg60: tensor<256xf32>, %arg61: tensor<256xf32>, %arg62: tensor<256xf32>, %arg63: tensor<256xf32>, %arg64: tensor<256xf32>, %arg65: tensor<256xf32>, %arg66: tensor<256xf32>, %arg67: tensor<256xf32>, %arg68: tensor<256xf32>, %arg69: tensor<256xf32>, %arg70: tensor<512xf32>, %arg71: tensor<512xf32>, %arg72: tensor<512xf32>, %arg73: tensor<512xf32>, %arg74: tensor<512xf32>, %arg75: tensor<512xf32>, %arg76: tensor<512xf32>, %arg77: tensor<512xf32>, %arg78: tensor<512xf32>, %arg79: tensor<512xf32>, %arg80: tensor<64x3x7x7xf16>, %arg81: tensor<1x3x224x224xf16>, %arg82: tensor<1x64x112x112xf16>, %arg83: tensor<1x64x112x112xf16>, %arg84: tensor<1x64x56x56xf16>, %arg85: tensor<64x64x3x3xf16>, %arg86: tensor<1x64x56x56xf16>, %arg87: tensor<1x64x56x56xf16>, %arg88: tensor<64x64x3x3xf16>, %arg89: tensor<1x64x56x56xf16>, %arg90: tensor<1x64x56x56xf16>, %arg91: tensor<64x64x3x3xf16>, %arg92: tensor<1x64x56x56xf16>, %arg93: tensor<1x64x56x56xf16>, %arg94: tensor<64x64x3x3xf16>, %arg95: tensor<1x64x56x56xf16>, %arg96: tensor<1x64x56x56xf16>, %arg97: tensor<128x64x3x3xf16>, %arg98: tensor<1x128x28x28xf16>, %arg99: tensor<1x128x28x28xf16>, %arg100: tensor<128x128x3x3xf16>, %arg101: tensor<1x128x28x28xf16>, %arg102: tensor<128x64x1x1xf16>, %arg103: tensor<1x128x28x28xf16>, %arg104: tensor<1x128x28x28xf16>, %arg105: tensor<128x128x3x3xf16>, %arg106: tensor<1x128x28x28xf16>, %arg107: tensor<1x128x28x28xf16>, %arg108: tensor<128x128x3x3xf16>, %arg109: tensor<1x128x28x28xf16>, %arg110: tensor<1x128x28x28xf16>, %arg111: tensor<256x128x3x3xf16>, %arg112: tensor<1x256x14x14xf16>, %arg113: tensor<1x256x14x14xf16>, %arg114: tensor<256x256x3x3xf16>, %arg115: tensor<1x256x14x14xf16>, %arg116: tensor<256x128x1x1xf16>, %arg117: tensor<1x256x14x14xf16>, %arg118: tensor<1x256x14x14xf16>, %arg119: tensor<256x256x3x3xf16>, %arg120: tensor<1x256x14x14xf16>, %arg121: tensor<1x256x14x14xf16>, %arg122: tensor<256x256x3x3xf16>, %arg123: tensor<1x256x14x14xf16>, %arg124: tensor<1x256x14x14xf16>, %arg125: tensor<512x256x3x3xf16>, %arg126: tensor<1x512x7x7xf16>, %arg127: tensor<1x512x7x7xf16>, %arg128: tensor<512x512x3x3xf16>, %arg129: tensor<1x512x7x7xf16>, %arg130: tensor<512x256x1x1xf16>, %arg131: tensor<1x512x7x7xf16>, %arg132: tensor<1x512x7x7xf16>, %arg133: tensor<512x512x3x3xf16>, %arg134: tensor<1x512x7x7xf16>, %arg135: tensor<1x512x7x7xf16>, %arg136: tensor<512x512x3x3xf16>, %arg137: tensor<1x512x7x7xf16>, %arg138: tensor<1x512x7x7xf16>, %arg139: tensor<1x512xf16>, %arg140: tensor<512x1000xf16>, %arg141: tensor<1x1000xf16>) -> (tensor<64xf32>, tensor<64xf32>, tensor<64x3x7x7xf32>, tensor<1000xf32>, tensor<1000x512xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64x64x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x64x3x3xf32>, tensor<128x128x3x3xf32>, tensor<128x64x1x1xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128x128x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, 
tensor<256xf32>, tensor<256x128x3x3xf32>, tensor<256x256x3x3xf32>, tensor<256x128x1x1xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256x256x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x256x3x3xf32>, tensor<512x512x3x3xf32>, tensor<512x256x1x1xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512x512x3x3xf32>) { - %0 = mhlo.constant dense<0.000000e+00> : tensor - %1 = mhlo.constant dense<0.000000e+00> : tensor - %2 = "mhlo.dot_general"(%arg141, %arg140) {dot_dimension_numbers = #mhlo.dot, precision_config = [#mhlo, #mhlo]} : (tensor<1x1000xf16>, tensor<512x1000xf16>) -> tensor<1x512xf16> - %3 = call @Unknown0(%2, %arg138) : (tensor<1x512xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> - %4:3 = call @BatchNormGradOp1(%arg137, %arg39, %3) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %5 = call @ConvBackwardDataOp2(%4#0, %arg136) : (tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> - %6 = call @ConvBackwardFilterOp3(%arg135, %4#0) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<512x512x3x3xf16> - %7 = call @Unknown4(%arg135, %5) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> - %8:3 = call @BatchNormGradOp5(%arg134, %arg37, %7) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %9 = call @ConvBackwardDataOp6(%8#0, %arg133) : (tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> - %10 = call @ConvBackwardFilterOp7(%arg132, %8#0) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<512x512x3x3xf16> - %11 = call @Unknown8(%3, %9, %arg132) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> - %12:3 = call @BatchNormGradOp9(%arg129, %arg33, %11) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %13 = call @ConvBackwardDataOp10(%12#0, %arg128) : (tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> - %14 = call @ConvBackwardFilterOp11(%arg127, %12#0) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<512x512x3x3xf16> - %15 = call @Unknown12(%arg127, %13) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> - %16:3 = call @BatchNormGradOp13(%arg126, %arg31, %15) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %17 = call @ConvBackwardDataOp14(%16#0, %arg125) : (tensor<1x512x7x7xf16>, tensor<512x256x3x3xf16>) -> tensor<1x256x14x14xf16> - %18 = call @ConvBackwardFilterOp15(%arg124, %16#0) : (tensor<1x256x14x14xf16>, tensor<1x512x7x7xf16>) -> tensor<512x256x3x3xf16> - %19:3 = call @BatchNormGradOp16(%arg131, %arg35, %11) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %20 = call @ConvBackwardDataOp17(%19#0, %arg130) : (tensor<1x512x7x7xf16>, tensor<512x256x1x1xf16>) -> tensor<1x256x14x14xf16> - %21 = call @ConvBackwardFilterOp18(%arg124, %19#0) : (tensor<1x256x14x14xf16>, tensor<1x512x7x7xf16>) -> tensor<512x256x1x1xf16> - %22 = call @Unknown19(%20, %17, %arg124) : (tensor<1x256x14x14xf16>, 
tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> - %23:3 = call @BatchNormGradOp20(%arg123, %arg29, %22) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %24 = call @ConvBackwardDataOp21(%23#0, %arg122) : (tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> - %25 = call @ConvBackwardFilterOp22(%arg121, %23#0) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<256x256x3x3xf16> - %26 = call @Unknown23(%arg121, %24) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> - %27:3 = call @BatchNormGradOp24(%arg120, %arg27, %26) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %28 = call @ConvBackwardDataOp25(%27#0, %arg119) : (tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> - %29 = call @ConvBackwardFilterOp26(%arg118, %27#0) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<256x256x3x3xf16> - %30 = call @Unknown27(%22, %28, %arg118) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> - %31:3 = call @BatchNormGradOp28(%arg115, %arg23, %30) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %32 = call @ConvBackwardDataOp29(%31#0, %arg114) : (tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> - %33 = call @ConvBackwardFilterOp30(%arg113, %31#0) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<256x256x3x3xf16> - %34 = call @Unknown31(%arg113, %32) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> - %35:3 = call @BatchNormGradOp32(%arg112, %arg21, %34) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %36 = call @ConvBackwardDataOp33(%35#0, %arg111) : (tensor<1x256x14x14xf16>, tensor<256x128x3x3xf16>) -> tensor<1x128x28x28xf16> - %37 = call @ConvBackwardFilterOp34(%arg110, %35#0) : (tensor<1x128x28x28xf16>, tensor<1x256x14x14xf16>) -> tensor<256x128x3x3xf16> - %38:3 = call @BatchNormGradOp35(%arg117, %arg25, %30) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %39 = call @ConvBackwardDataOp36(%38#0, %arg116) : (tensor<1x256x14x14xf16>, tensor<256x128x1x1xf16>) -> tensor<1x128x28x28xf16> - %40 = call @ConvBackwardFilterOp37(%arg110, %38#0) : (tensor<1x128x28x28xf16>, tensor<1x256x14x14xf16>) -> tensor<256x128x1x1xf16> - %41 = call @Unknown38(%39, %36, %arg110) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> - %42:3 = call @BatchNormGradOp39(%arg109, %arg19, %41) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %43 = call @ConvBackwardDataOp40(%42#0, %arg108) : (tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> - %44 = call @ConvBackwardFilterOp41(%arg107, %42#0) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<128x128x3x3xf16> - %45 = call @Unknown42(%arg107, %43) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> - %46:3 = call @BatchNormGradOp43(%arg106, %arg17, %45) : (tensor<1x128x28x28xf16>, tensor<128xf32>, 
tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %47 = call @ConvBackwardDataOp44(%46#0, %arg105) : (tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> - %48 = call @ConvBackwardFilterOp45(%arg104, %46#0) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<128x128x3x3xf16> - %49 = call @Unknown46(%41, %47, %arg104) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> - %50:3 = call @BatchNormGradOp47(%arg101, %arg13, %49) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %51 = call @ConvBackwardDataOp48(%50#0, %arg100) : (tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> - %52 = call @ConvBackwardFilterOp49(%arg99, %50#0) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<128x128x3x3xf16> - %53 = call @Unknown50(%arg99, %51) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> - %54:3 = call @BatchNormGradOp51(%arg98, %arg11, %53) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %55 = call @ConvBackwardDataOp52(%54#0, %arg97) : (tensor<1x128x28x28xf16>, tensor<128x64x3x3xf16>) -> tensor<1x64x56x56xf16> - %56 = call @ConvBackwardFilterOp53(%arg96, %54#0) : (tensor<1x64x56x56xf16>, tensor<1x128x28x28xf16>) -> tensor<128x64x3x3xf16> - %57:3 = call @BatchNormGradOp54(%arg103, %arg15, %49) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %58 = call @ConvBackwardDataOp55(%57#0, %arg102) : (tensor<1x128x28x28xf16>, tensor<128x64x1x1xf16>) -> tensor<1x64x56x56xf16> - %59 = call @ConvBackwardFilterOp56(%arg96, %57#0) : (tensor<1x64x56x56xf16>, tensor<1x128x28x28xf16>) -> tensor<128x64x1x1xf16> - %60 = call @Unknown57(%58, %55, %arg96) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> - %61:3 = call @BatchNormGradOp58(%arg95, %arg9, %60) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<1x64x56x56xf16>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) - %62 = call @ConvBackwardDataOp59(%61#0, %arg94) : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> - %63 = call @ConvBackwardFilterOp60(%arg93, %61#0) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<64x64x3x3xf16> - %64 = call @Unknown61(%arg93, %62) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> - %65:3 = call @BatchNormGradOp62(%arg92, %arg7, %64) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<1x64x56x56xf16>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) - %66 = call @ConvBackwardDataOp63(%65#0, %arg91) : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> - %67 = call @ConvBackwardFilterOp64(%arg90, %65#0) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<64x64x3x3xf16> - %68 = call @Unknown65(%60, %66, %arg90) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> - %69:3 = call @BatchNormGradOp66(%arg89, %arg5, %68) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<1x64x56x56xf16>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) - %70 = call @ConvBackwardDataOp67(%69#0, %arg88) : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> - %71 = call 
@ConvBackwardFilterOp68(%arg87, %69#0) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<64x64x3x3xf16> - %72 = call @Unknown69(%arg87, %70) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> - %73:3 = call @BatchNormGradOp70(%arg86, %arg3, %72) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<1x64x56x56xf16>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) - %74 = call @ConvBackwardDataOp71(%73#0, %arg85) : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> - %75 = call @ConvBackwardFilterOp72(%arg84, %73#0) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<64x64x3x3xf16> - %76 = call @Unknown73(%68, %74) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> - %77 = "mhlo.select_and_scatter"(%arg83, %76, %1) ({ + %0 = mhlo.constant dense<0.000000e+00> : tensor + %1 = "mhlo.dot_general"(%arg141, %arg140) {dot_dimension_numbers = #mhlo.dot, precision_config = [#mhlo, #mhlo]} : (tensor<1x1000xf16>, tensor<512x1000xf16>) -> tensor<1x512xf16> + %2 = call @Unknown0(%1, %arg138) : (tensor<1x512xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> + %3:3 = call @BatchNormGradOp1(%arg137, %arg39, %2) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %4 = call @ConvBackwardDataOp2(%3#0, %arg136) : (tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> + %5 = call @ConvBackwardFilterOp3(%arg135, %3#0) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<512x512x3x3xf16> + %6 = call @Unknown4(%arg135, %4) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> + %7:3 = call @BatchNormGradOp1(%arg134, %arg37, %6) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %8 = call @ConvBackwardDataOp2(%7#0, %arg133) : (tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> + %9 = call @ConvBackwardFilterOp3(%arg132, %7#0) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<512x512x3x3xf16> + %10 = call @Unknown8(%2, %8, %arg132) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> + %11:3 = call @BatchNormGradOp1(%arg129, %arg33, %10) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %12 = call @ConvBackwardDataOp2(%11#0, %arg128) : (tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> + %13 = call @ConvBackwardFilterOp3(%arg127, %11#0) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<512x512x3x3xf16> + %14 = call @Unknown4(%arg127, %12) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> + %15:3 = call @BatchNormGradOp1(%arg126, %arg31, %14) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %16 = call @ConvBackwardDataOp14(%15#0, %arg125) : (tensor<1x512x7x7xf16>, tensor<512x256x3x3xf16>) -> tensor<1x256x14x14xf16> + %17 = call @ConvBackwardFilterOp15(%arg124, %15#0) : (tensor<1x256x14x14xf16>, tensor<1x512x7x7xf16>) -> tensor<512x256x3x3xf16> + %18:3 = call @BatchNormGradOp1(%arg131, %arg35, %10) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<1x512x7x7xf16>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %19 = call @ConvBackwardDataOp17(%18#0, %arg130) : (tensor<1x512x7x7xf16>, tensor<512x256x1x1xf16>) -> 
tensor<1x256x14x14xf16> + %20 = call @ConvBackwardFilterOp18(%arg124, %18#0) : (tensor<1x256x14x14xf16>, tensor<1x512x7x7xf16>) -> tensor<512x256x1x1xf16> + %21 = call @Unknown19(%19, %16, %arg124) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> + %22:3 = call @BatchNormGradOp20(%arg123, %arg29, %21) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %23 = call @ConvBackwardDataOp21(%22#0, %arg122) : (tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> + %24 = call @ConvBackwardFilterOp22(%arg121, %22#0) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<256x256x3x3xf16> + %25 = call @Unknown23(%arg121, %23) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> + %26:3 = call @BatchNormGradOp20(%arg120, %arg27, %25) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %27 = call @ConvBackwardDataOp21(%26#0, %arg119) : (tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> + %28 = call @ConvBackwardFilterOp22(%arg118, %26#0) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<256x256x3x3xf16> + %29 = call @Unknown19(%21, %27, %arg118) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> + %30:3 = call @BatchNormGradOp20(%arg115, %arg23, %29) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %31 = call @ConvBackwardDataOp21(%30#0, %arg114) : (tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> + %32 = call @ConvBackwardFilterOp22(%arg113, %30#0) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<256x256x3x3xf16> + %33 = call @Unknown23(%arg113, %31) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> + %34:3 = call @BatchNormGradOp20(%arg112, %arg21, %33) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %35 = call @ConvBackwardDataOp33(%34#0, %arg111) : (tensor<1x256x14x14xf16>, tensor<256x128x3x3xf16>) -> tensor<1x128x28x28xf16> + %36 = call @ConvBackwardFilterOp34(%arg110, %34#0) : (tensor<1x128x28x28xf16>, tensor<1x256x14x14xf16>) -> tensor<256x128x3x3xf16> + %37:3 = call @BatchNormGradOp20(%arg117, %arg25, %29) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<1x256x14x14xf16>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %38 = call @ConvBackwardDataOp36(%37#0, %arg116) : (tensor<1x256x14x14xf16>, tensor<256x128x1x1xf16>) -> tensor<1x128x28x28xf16> + %39 = call @ConvBackwardFilterOp37(%arg110, %37#0) : (tensor<1x128x28x28xf16>, tensor<1x256x14x14xf16>) -> tensor<256x128x1x1xf16> + %40 = call @Unknown38(%38, %35, %arg110) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> + %41:3 = call @BatchNormGradOp39(%arg109, %arg19, %40) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %42 = call @ConvBackwardDataOp40(%41#0, %arg108) : (tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> + %43 = call @ConvBackwardFilterOp41(%arg107, %41#0) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<128x128x3x3xf16> + 
%44 = call @Unknown42(%arg107, %42) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> + %45:3 = call @BatchNormGradOp39(%arg106, %arg17, %44) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %46 = call @ConvBackwardDataOp40(%45#0, %arg105) : (tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> + %47 = call @ConvBackwardFilterOp41(%arg104, %45#0) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<128x128x3x3xf16> + %48 = call @Unknown38(%40, %46, %arg104) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> + %49:3 = call @BatchNormGradOp39(%arg101, %arg13, %48) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %50 = call @ConvBackwardDataOp40(%49#0, %arg100) : (tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> + %51 = call @ConvBackwardFilterOp41(%arg99, %49#0) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<128x128x3x3xf16> + %52 = call @Unknown42(%arg99, %50) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> + %53:3 = call @BatchNormGradOp39(%arg98, %arg11, %52) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %54 = call @ConvBackwardDataOp52(%53#0, %arg97) : (tensor<1x128x28x28xf16>, tensor<128x64x3x3xf16>) -> tensor<1x64x56x56xf16> + %55 = call @ConvBackwardFilterOp53(%arg96, %53#0) : (tensor<1x64x56x56xf16>, tensor<1x128x28x28xf16>) -> tensor<128x64x3x3xf16> + %56:3 = call @BatchNormGradOp39(%arg103, %arg15, %48) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<1x128x28x28xf16>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %57 = call @ConvBackwardDataOp55(%56#0, %arg102) : (tensor<1x128x28x28xf16>, tensor<128x64x1x1xf16>) -> tensor<1x64x56x56xf16> + %58 = call @ConvBackwardFilterOp56(%arg96, %56#0) : (tensor<1x64x56x56xf16>, tensor<1x128x28x28xf16>) -> tensor<128x64x1x1xf16> + %59 = call @Unknown57(%57, %54, %arg96) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> + %60:3 = call @BatchNormGradOp58(%arg95, %arg9, %59) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<1x64x56x56xf16>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) + %61 = call @ConvBackwardDataOp59(%60#0, %arg94) : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> + %62 = call @ConvBackwardFilterOp60(%arg93, %60#0) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<64x64x3x3xf16> + %63 = call @Unknown61(%arg93, %61) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> + %64:3 = call @BatchNormGradOp58(%arg92, %arg7, %63) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<1x64x56x56xf16>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) + %65 = call @ConvBackwardDataOp59(%64#0, %arg91) : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> + %66 = call @ConvBackwardFilterOp60(%arg90, %64#0) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<64x64x3x3xf16> + %67 = call @Unknown57(%59, %65, %arg90) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> + %68:3 = call @BatchNormGradOp58(%arg89, %arg5, %67) : (tensor<1x64x56x56xf16>, tensor<64xf32>, 
tensor<1x64x56x56xf16>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) + %69 = call @ConvBackwardDataOp59(%68#0, %arg88) : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> + %70 = call @ConvBackwardFilterOp60(%arg87, %68#0) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<64x64x3x3xf16> + %71 = call @Unknown61(%arg87, %69) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> + %72:3 = call @BatchNormGradOp58(%arg86, %arg3, %71) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<1x64x56x56xf16>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) + %73 = call @ConvBackwardDataOp59(%72#0, %arg85) : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> + %74 = call @ConvBackwardFilterOp60(%arg84, %72#0) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<64x64x3x3xf16> + %75 = call @Unknown73(%67, %73) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> + %76 = "mhlo.select_and_scatter"(%arg83, %75, %0) ({ ^bb0(%arg142: tensor, %arg143: tensor): - %107 = mhlo.compare GE, %arg142, %arg143 : (tensor, tensor) -> tensor - mhlo.return %107 : tensor + %104 = mhlo.compare GE, %arg142, %arg143 : (tensor, tensor) -> tensor + mhlo.return %104 : tensor }, { ^bb0(%arg142: tensor, %arg143: tensor): - %107 = mhlo.add %arg142, %arg143 : tensor - mhlo.return %107 : tensor + %104 = mhlo.add %arg142, %arg143 : tensor + mhlo.return %104 : tensor }) {padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : (tensor<1x64x112x112xf16>, tensor<1x64x56x56xf16>, tensor) -> tensor<1x64x112x112xf16> - %78 = call @Unknown74(%arg83, %77) : (tensor<1x64x112x112xf16>, tensor<1x64x112x112xf16>) -> tensor<1x64x112x112xf16> - %79:3 = call @BatchNormGradOp75(%arg82, %arg1, %78) : (tensor<1x64x112x112xf16>, tensor<64xf32>, tensor<1x64x112x112xf16>) -> (tensor<1x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) - %80 = call @ConvBackwardFilterOp76(%arg81, %79#0) : (tensor<1x3x224x224xf16>, tensor<1x64x112x112xf16>) -> tensor<64x3x7x7xf16> - %81 = call @Unknown77(%80) : (tensor<64x3x7x7xf16>) -> tensor<64x3x7x7xf32> - %82 = call @Unknown78(%arg141) : (tensor<1x1000xf16>) -> tensor<1x1000xf32> - %83 = mhlo.reduce(%82 init: %0) across dimensions = [0] : (tensor<1x1000xf32>, tensor) -> tensor<1000xf32> - reducer(%arg142: tensor, %arg143: tensor) { - %107 = mhlo.add %arg142, %arg143 : tensor - mhlo.return %107 : tensor - } - %84 = call @Unknown79(%83) : (tensor<1000xf32>) -> tensor<1000xf32> - %85 = mhlo.reshape %arg141 : (tensor<1x1000xf16>) -> tensor<1000x1xf16> - %86 = "mhlo.dot"(%85, %arg139) {precision_config = [#mhlo, #mhlo]} : (tensor<1000x1xf16>, tensor<1x512xf16>) -> tensor<1000x512xf16> - %87 = call @Unknown80(%86) : (tensor<1000x512xf16>) -> tensor<1000x512xf32> - %88 = call @Unknown81(%75) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - %89 = call @Unknown82(%71) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - %90 = call @Unknown83(%67) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - %91 = call @Unknown84(%63) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - %92 = call @Unknown85(%56) : (tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> - %93 = call @Unknown86(%52) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> - %94 = call @Unknown87(%59) : (tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> - %95 = call @Unknown88(%48) : 
(tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> - %96 = call @Unknown89(%44) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> - %97 = call @Unknown90(%37) : (tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> - %98 = call @Unknown91(%33) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> - %99 = call @Unknown92(%40) : (tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> - %100 = call @Unknown93(%29) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> - %101 = call @Unknown94(%25) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> - %102 = call @Unknown95(%18) : (tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> - %103 = call @Unknown96(%14) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> - %104 = call @Unknown97(%21) : (tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> - %105 = call @Unknown98(%10) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> - %106 = call @Unknown99(%6) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> - return %79#2, %79#1, %81, %84, %87, %73#2, %73#1, %69#2, %69#1, %88, %89, %65#2, %65#1, %61#2, %61#1, %90, %91, %54#2, %54#1, %50#2, %50#1, %92, %93, %94, %57#2, %57#1, %46#2, %46#1, %42#2, %42#1, %95, %96, %35#2, %35#1, %31#2, %31#1, %97, %98, %99, %38#2, %38#1, %27#2, %27#1, %23#2, %23#1, %100, %101, %16#2, %16#1, %12#2, %12#1, %102, %103, %104, %19#2, %19#1, %8#2, %8#1, %4#2, %4#1, %105, %106 : tensor<64xf32>, tensor<64xf32>, tensor<64x3x7x7xf32>, tensor<1000xf32>, tensor<1000x512xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64x64x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x64x3x3xf32>, tensor<128x128x3x3xf32>, tensor<128x64x1x1xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128x128x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x128x3x3xf32>, tensor<256x256x3x3xf32>, tensor<256x128x1x1xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256x256x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x256x3x3xf32>, tensor<512x512x3x3xf32>, tensor<512x256x1x1xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512x512x3x3xf32> + %77 = call @Unknown74(%arg83, %76) : (tensor<1x64x112x112xf16>, tensor<1x64x112x112xf16>) -> tensor<1x64x112x112xf16> + %78:3 = call @BatchNormGradOp75(%arg82, %arg1, %77) : (tensor<1x64x112x112xf16>, tensor<64xf32>, tensor<1x64x112x112xf16>) -> (tensor<1x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) + %79 = call @ConvBackwardFilterOp76(%arg81, %78#0) : (tensor<1x3x224x224xf16>, tensor<1x64x112x112xf16>) -> tensor<64x3x7x7xf16> + %80 = call @Unknown77(%79) : (tensor<64x3x7x7xf16>) -> tensor<64x3x7x7xf32> + %81 = call @Unknown78(%arg141) : (tensor<1x1000xf16>) -> tensor<1000xf32> + %82 = mhlo.reshape %arg141 : (tensor<1x1000xf16>) -> tensor<1000x1xf16> + %83 = "mhlo.dot"(%82, %arg139) {precision_config = [#mhlo, #mhlo]} : (tensor<1000x1xf16>, tensor<1x512xf16>) -> tensor<1000x512xf16> + %84 = call @Unknown79(%83) : (tensor<1000x512xf16>) -> tensor<1000x512xf32> + %85 = call @Unknown80(%74) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> + %86 = call 
@Unknown80(%70) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> + %87 = call @Unknown80(%66) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> + %88 = call @Unknown80(%62) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> + %89 = call @Unknown84(%55) : (tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> + %90 = call @Unknown85(%51) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> + %91 = call @Unknown86(%58) : (tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> + %92 = call @Unknown85(%47) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> + %93 = call @Unknown85(%43) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> + %94 = call @Unknown89(%36) : (tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> + %95 = call @Unknown90(%32) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> + %96 = call @Unknown91(%39) : (tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> + %97 = call @Unknown90(%28) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> + %98 = call @Unknown90(%24) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> + %99 = call @Unknown94(%17) : (tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> + %100 = call @Unknown95(%13) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> + %101 = call @Unknown96(%20) : (tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> + %102 = call @Unknown95(%9) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> + %103 = call @Unknown95(%5) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> + return %78#2, %78#1, %80, %81, %84, %72#2, %72#1, %68#2, %68#1, %85, %86, %64#2, %64#1, %60#2, %60#1, %87, %88, %53#2, %53#1, %49#2, %49#1, %89, %90, %91, %56#2, %56#1, %45#2, %45#1, %41#2, %41#1, %92, %93, %34#2, %34#1, %30#2, %30#1, %94, %95, %96, %37#2, %37#1, %26#2, %26#1, %22#2, %22#1, %97, %98, %15#2, %15#1, %11#2, %11#1, %99, %100, %101, %18#2, %18#1, %7#2, %7#1, %3#2, %3#1, %102, %103 : tensor<64xf32>, tensor<64xf32>, tensor<64x3x7x7xf32>, tensor<1000xf32>, tensor<1000x512xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64x64x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x64x3x3xf32>, tensor<128x128x3x3xf32>, tensor<128x64x1x1xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128x128x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x128x3x3xf32>, tensor<256x256x3x3xf32>, tensor<256x128x1x1xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256x256x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x256x3x3xf32>, tensor<512x512x3x3xf32>, tensor<512x256x1x1xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512x512x3x3xf32> } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/BW/4_bufferize_opt.mlir b/compiler/test/E2E/ResNet18/BW/4_bufferize_opt.mlir index 2db090757..9a8d1f4c0 100644 --- a/compiler/test/E2E/ResNet18/BW/4_bufferize_opt.mlir +++ b/compiler/test/E2E/ResNet18/BW/4_bufferize_opt.mlir @@ -2,424 +2,661 @@ // CHECK-LABEL: func.func @main -#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> -#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> -#map2 = 
affine_map<(d0, d1) -> (d0, d1)> -#map3 = affine_map<(d0) -> (d0)> +#map = affine_map<() -> ()> module { func.func private @Unknown0(%arg0: tensor<1x512xf16>, %arg1: tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %cst_0 = arith.constant 4.900000e+01 : f16 %0 = tensor.empty() : tensor<1x512x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg1, %arg0 : tensor<1x512x7x7xf16>, tensor<1x512xf16>) outs(%0 : tensor<1x512x7x7xf16>) { - ^bb0(%in: f16, %in_1: f16, %out: f16): - %2 = arith.divf %in_1, %cst_0 : f16 - %3 = arith.cmpf ogt, %in, %cst : f16 - %4 = arith.select %3, %2, %cst : f16 - linalg.yield %4 : f16 - } -> tensor<1x512x7x7xf16> + %1 = scf.for %arg2 = %c0 to %c512 step %c1 iter_args(%arg3 = %0) -> (tensor<1x512x7x7xf16>) { + %2 = scf.for %arg4 = %c0 to %c7 step %c1 iter_args(%arg5 = %arg3) -> (tensor<1x512x7x7xf16>) { + %3 = scf.for %arg6 = %c0 to %c7 step %c1 iter_args(%arg7 = %arg5) -> (tensor<1x512x7x7xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg2] [1, 1] [1, 1] : tensor<1x512xf16> to tensor + %extracted_slice_1 = tensor.extract_slice %arg1[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x512x7x7xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_1 : tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %6 = arith.divf %in, %cst_0 : f16 + %7 = arith.cmpf ogt, %in_2, %cst : f16 + %8 = arith.select %7, %6, %cst : f16 + linalg.yield %8 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg7[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x512x7x7xf16> + scf.yield %inserted_slice : tensor<1x512x7x7xf16> + } + scf.yield %3 : tensor<1x512x7x7xf16> + } + scf.yield %2 : tensor<1x512x7x7xf16> + } return %1 : tensor<1x512x7x7xf16> } func.func private @Unknown4(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x512x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) outs(%0 : tensor<1x512x7x7xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.cmpf ogt, %in, %cst : f16 - %3 = arith.select %2, %in_0, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x512x7x7xf16> + %1 = scf.for %arg2 = %c0 to %c512 step %c1 iter_args(%arg3 = %0) -> (tensor<1x512x7x7xf16>) { + %2 = scf.for %arg4 = %c0 to %c7 step %c1 iter_args(%arg5 = %arg3) -> (tensor<1x512x7x7xf16>) { + %3 = scf.for %arg6 = %c0 to %c7 step %c1 iter_args(%arg7 = %arg5) -> (tensor<1x512x7x7xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x512x7x7xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x512x7x7xf16> to tensor + %4 = tensor.empty() : tensor + 
%5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + %6 = arith.cmpf ogt, %in, %cst : f16 + %7 = arith.select %6, %in_1, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg7[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x512x7x7xf16> + scf.yield %inserted_slice : tensor<1x512x7x7xf16> + } + scf.yield %3 : tensor<1x512x7x7xf16> + } + scf.yield %2 : tensor<1x512x7x7xf16> + } return %1 : tensor<1x512x7x7xf16> } func.func private @Unknown8(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<1x512x7x7xf16>, %arg2: tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x512x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) outs(%0 : tensor<1x512x7x7xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.cmpf ogt, %in, %cst : f16 - %4 = arith.select %3, %2, %cst : f16 - linalg.yield %4 : f16 - } -> tensor<1x512x7x7xf16> - return %1 : tensor<1x512x7x7xf16> - } - func.func private @Unknown12(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x512x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) outs(%0 : tensor<1x512x7x7xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.cmpf ogt, %in, %cst : f16 - %3 = arith.select %2, %in_0, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x512x7x7xf16> + %1 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %0) -> (tensor<1x512x7x7xf16>) { + %2 = scf.for %arg5 = %c0 to %c7 step %c1 iter_args(%arg6 = %arg4) -> (tensor<1x512x7x7xf16>) { + %3 = scf.for %arg7 = %c0 to %c7 step %c1 iter_args(%arg8 = %arg6) -> (tensor<1x512x7x7xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x512x7x7xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x512x7x7xf16> to tensor + %extracted_slice_1 = tensor.extract_slice %arg2[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x512x7x7xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0, %extracted_slice_1 : tensor, tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_2: f16, %in_3: f16, %out: f16): + %6 = arith.addf %in, %in_2 : f16 + %7 = arith.cmpf ogt, %in_3, %cst : f16 + %8 = arith.select %7, %6, %cst : f16 + linalg.yield %8 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg8[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x512x7x7xf16> + scf.yield %inserted_slice : tensor<1x512x7x7xf16> + } + scf.yield %3 : 
tensor<1x512x7x7xf16> + } + scf.yield %2 : tensor<1x512x7x7xf16> + } return %1 : tensor<1x512x7x7xf16> } func.func private @Unknown19(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<1x256x14x14xf16>, %arg2: tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { + %c14 = arith.constant 14 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x256x14x14xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) outs(%0 : tensor<1x256x14x14xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.cmpf ogt, %in, %cst : f16 - %4 = arith.select %3, %2, %cst : f16 - linalg.yield %4 : f16 - } -> tensor<1x256x14x14xf16> + %1 = scf.for %arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %0) -> (tensor<1x256x14x14xf16>) { + %2 = scf.for %arg5 = %c0 to %c14 step %c1 iter_args(%arg6 = %arg4) -> (tensor<1x256x14x14xf16>) { + %3 = scf.for %arg7 = %c0 to %c14 step %c1 iter_args(%arg8 = %arg6) -> (tensor<1x256x14x14xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x256x14x14xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x256x14x14xf16> to tensor + %extracted_slice_1 = tensor.extract_slice %arg2[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x256x14x14xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0, %extracted_slice_1 : tensor, tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_2: f16, %in_3: f16, %out: f16): + %6 = arith.addf %in, %in_2 : f16 + %7 = arith.cmpf ogt, %in_3, %cst : f16 + %8 = arith.select %7, %6, %cst : f16 + linalg.yield %8 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg8[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x256x14x14xf16> + scf.yield %inserted_slice : tensor<1x256x14x14xf16> + } + scf.yield %3 : tensor<1x256x14x14xf16> + } + scf.yield %2 : tensor<1x256x14x14xf16> + } return %1 : tensor<1x256x14x14xf16> } func.func private @Unknown23(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { + %c14 = arith.constant 14 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x256x14x14xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) outs(%0 : tensor<1x256x14x14xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.cmpf ogt, %in, %cst : f16 - %3 = arith.select %2, %in_0, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x256x14x14xf16> - return %1 : tensor<1x256x14x14xf16> - } - func.func private @Unknown27(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<1x256x14x14xf16>, %arg2: tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 
0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x256x14x14xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) outs(%0 : tensor<1x256x14x14xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.cmpf ogt, %in, %cst : f16 - %4 = arith.select %3, %2, %cst : f16 - linalg.yield %4 : f16 - } -> tensor<1x256x14x14xf16> - return %1 : tensor<1x256x14x14xf16> - } - func.func private @Unknown31(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x256x14x14xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) outs(%0 : tensor<1x256x14x14xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.cmpf ogt, %in, %cst : f16 - %3 = arith.select %2, %in_0, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x256x14x14xf16> + %1 = scf.for %arg2 = %c0 to %c256 step %c1 iter_args(%arg3 = %0) -> (tensor<1x256x14x14xf16>) { + %2 = scf.for %arg4 = %c0 to %c14 step %c1 iter_args(%arg5 = %arg3) -> (tensor<1x256x14x14xf16>) { + %3 = scf.for %arg6 = %c0 to %c14 step %c1 iter_args(%arg7 = %arg5) -> (tensor<1x256x14x14xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x256x14x14xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x256x14x14xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + %6 = arith.cmpf ogt, %in, %cst : f16 + %7 = arith.select %6, %in_1, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg7[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x256x14x14xf16> + scf.yield %inserted_slice : tensor<1x256x14x14xf16> + } + scf.yield %3 : tensor<1x256x14x14xf16> + } + scf.yield %2 : tensor<1x256x14x14xf16> + } return %1 : tensor<1x256x14x14xf16> } func.func private @Unknown38(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<1x128x28x28xf16>, %arg2: tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { + %c28 = arith.constant 28 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x128x28x28xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) outs(%0 : tensor<1x128x28x28xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.cmpf ogt, %in, %cst : f16 - %4 = arith.select %3, %2, %cst : f16 - linalg.yield %4 : f16 - } -> tensor<1x128x28x28xf16> + %1 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %0) -> (tensor<1x128x28x28xf16>) { + %2 = scf.for %arg5 = %c0 
to %c28 step %c1 iter_args(%arg6 = %arg4) -> (tensor<1x128x28x28xf16>) { + %3 = scf.for %arg7 = %c0 to %c28 step %c1 iter_args(%arg8 = %arg6) -> (tensor<1x128x28x28xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x128x28x28xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x128x28x28xf16> to tensor + %extracted_slice_1 = tensor.extract_slice %arg2[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x128x28x28xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0, %extracted_slice_1 : tensor, tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_2: f16, %in_3: f16, %out: f16): + %6 = arith.addf %in, %in_2 : f16 + %7 = arith.cmpf ogt, %in_3, %cst : f16 + %8 = arith.select %7, %6, %cst : f16 + linalg.yield %8 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg8[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x128x28x28xf16> + scf.yield %inserted_slice : tensor<1x128x28x28xf16> + } + scf.yield %3 : tensor<1x128x28x28xf16> + } + scf.yield %2 : tensor<1x128x28x28xf16> + } return %1 : tensor<1x128x28x28xf16> } func.func private @Unknown42(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { + %c28 = arith.constant 28 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x128x28x28xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) outs(%0 : tensor<1x128x28x28xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.cmpf ogt, %in, %cst : f16 - %3 = arith.select %2, %in_0, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x128x28x28xf16> - return %1 : tensor<1x128x28x28xf16> - } - func.func private @Unknown46(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<1x128x28x28xf16>, %arg2: tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x128x28x28xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) outs(%0 : tensor<1x128x28x28xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.cmpf ogt, %in, %cst : f16 - %4 = arith.select %3, %2, %cst : f16 - linalg.yield %4 : f16 - } -> tensor<1x128x28x28xf16> - return %1 : tensor<1x128x28x28xf16> - } - func.func private @Unknown50(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x128x28x28xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) outs(%0 : tensor<1x128x28x28xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.cmpf 
ogt, %in, %cst : f16 - %3 = arith.select %2, %in_0, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x128x28x28xf16> + %1 = scf.for %arg2 = %c0 to %c128 step %c1 iter_args(%arg3 = %0) -> (tensor<1x128x28x28xf16>) { + %2 = scf.for %arg4 = %c0 to %c28 step %c1 iter_args(%arg5 = %arg3) -> (tensor<1x128x28x28xf16>) { + %3 = scf.for %arg6 = %c0 to %c28 step %c1 iter_args(%arg7 = %arg5) -> (tensor<1x128x28x28xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x128x28x28xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x128x28x28xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + %6 = arith.cmpf ogt, %in, %cst : f16 + %7 = arith.select %6, %in_1, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg7[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x128x28x28xf16> + scf.yield %inserted_slice : tensor<1x128x28x28xf16> + } + scf.yield %3 : tensor<1x128x28x28xf16> + } + scf.yield %2 : tensor<1x128x28x28xf16> + } return %1 : tensor<1x128x28x28xf16> } func.func private @Unknown57(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<1x64x56x56xf16>, %arg2: tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + %c56 = arith.constant 56 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) outs(%0 : tensor<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.cmpf ogt, %in, %cst : f16 - %4 = arith.select %3, %2, %cst : f16 - linalg.yield %4 : f16 - } -> tensor<1x64x56x56xf16> + %1 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %0) -> (tensor<1x64x56x56xf16>) { + %2 = scf.for %arg5 = %c0 to %c56 step %c1 iter_args(%arg6 = %arg4) -> (tensor<1x64x56x56xf16>) { + %3 = scf.for %arg7 = %c0 to %c56 step %c1 iter_args(%arg8 = %arg6) -> (tensor<1x64x56x56xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x56x56xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x56x56xf16> to tensor + %extracted_slice_1 = tensor.extract_slice %arg2[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x56x56xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0, %extracted_slice_1 : tensor, tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_2: f16, %in_3: f16, %out: f16): + %6 = arith.addf %in, %in_2 : f16 + %7 = arith.cmpf ogt, %in_3, %cst : f16 + %8 = arith.select %7, %6, %cst : f16 + linalg.yield %8 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg8[0, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x64x56x56xf16> + scf.yield 
%inserted_slice : tensor<1x64x56x56xf16> + } + scf.yield %3 : tensor<1x64x56x56xf16> + } + scf.yield %2 : tensor<1x64x56x56xf16> + } return %1 : tensor<1x64x56x56xf16> } func.func private @Unknown61(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + %c56 = arith.constant 56 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) outs(%0 : tensor<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.cmpf ogt, %in, %cst : f16 - %3 = arith.select %2, %in_0, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x64x56x56xf16> - return %1 : tensor<1x64x56x56xf16> - } - func.func private @Unknown65(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<1x64x56x56xf16>, %arg2: tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) outs(%0 : tensor<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.cmpf ogt, %in, %cst : f16 - %4 = arith.select %3, %2, %cst : f16 - linalg.yield %4 : f16 - } -> tensor<1x64x56x56xf16> - return %1 : tensor<1x64x56x56xf16> - } - func.func private @Unknown69(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) outs(%0 : tensor<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.cmpf ogt, %in, %cst : f16 - %3 = arith.select %2, %in_0, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x64x56x56xf16> + %1 = scf.for %arg2 = %c0 to %c64 step %c1 iter_args(%arg3 = %0) -> (tensor<1x64x56x56xf16>) { + %2 = scf.for %arg4 = %c0 to %c56 step %c1 iter_args(%arg5 = %arg3) -> (tensor<1x64x56x56xf16>) { + %3 = scf.for %arg6 = %c0 to %c56 step %c1 iter_args(%arg7 = %arg5) -> (tensor<1x64x56x56xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x56x56xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x56x56xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + %6 = arith.cmpf ogt, %in, %cst : f16 + %7 = arith.select %6, %in_1, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg7[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x64x56x56xf16> + scf.yield %inserted_slice 
: tensor<1x64x56x56xf16> + } + scf.yield %3 : tensor<1x64x56x56xf16> + } + scf.yield %2 : tensor<1x64x56x56xf16> + } return %1 : tensor<1x64x56x56xf16> } func.func private @Unknown73(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + %c56 = arith.constant 56 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<1x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) outs(%0 : tensor<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.addf %in, %in_0 : f16 - linalg.yield %2 : f16 - } -> tensor<1x64x56x56xf16> + %1 = scf.for %arg2 = %c0 to %c64 step %c1 iter_args(%arg3 = %0) -> (tensor<1x64x56x56xf16>) { + %2 = scf.for %arg4 = %c0 to %c56 step %c1 iter_args(%arg5 = %arg3) -> (tensor<1x64x56x56xf16>) { + %3 = scf.for %arg6 = %c0 to %c56 step %c1 iter_args(%arg7 = %arg5) -> (tensor<1x64x56x56xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x56x56xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x56x56xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + %6 = arith.addf %in, %in_1 : f16 + linalg.yield %6 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg7[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x64x56x56xf16> + scf.yield %inserted_slice : tensor<1x64x56x56xf16> + } + scf.yield %3 : tensor<1x64x56x56xf16> + } + scf.yield %2 : tensor<1x64x56x56xf16> + } return %1 : tensor<1x64x56x56xf16> } func.func private @Unknown74(%arg0: tensor<1x64x112x112xf16>, %arg1: tensor<1x64x112x112xf16>) -> tensor<1x64x112x112xf16> attributes {__byteir_elementwise_fusion__} { + %c112 = arith.constant 112 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x64x112x112xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x64x112x112xf16>, tensor<1x64x112x112xf16>) outs(%0 : tensor<1x64x112x112xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.cmpf ogt, %in, %cst : f16 - %3 = arith.select %2, %in_0, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x64x112x112xf16> + %1 = scf.for %arg2 = %c0 to %c64 step %c1 iter_args(%arg3 = %0) -> (tensor<1x64x112x112xf16>) { + %2 = scf.for %arg4 = %c0 to %c112 step %c1 iter_args(%arg5 = %arg3) -> (tensor<1x64x112x112xf16>) { + %3 = scf.for %arg6 = %c0 to %c112 step %c1 iter_args(%arg7 = %arg5) -> (tensor<1x64x112x112xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x112x112xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x112x112xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} 
ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + %6 = arith.cmpf ogt, %in, %cst : f16 + %7 = arith.select %6, %in_1, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg7[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x64x112x112xf16> + scf.yield %inserted_slice : tensor<1x64x112x112xf16> + } + scf.yield %3 : tensor<1x64x112x112xf16> + } + scf.yield %2 : tensor<1x64x112x112xf16> + } return %1 : tensor<1x64x112x112xf16> } func.func private @Unknown77(%arg0: tensor<64x3x7x7xf16>) -> tensor<64x3x7x7xf32> attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<64x3x7x7xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x3x7x7xf16>) outs(%0 : tensor<64x3x7x7xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<64x3x7x7xf32> + %1 = scf.for %arg1 = %c0 to %c64 step %c1 iter_args(%arg2 = %0) -> (tensor<64x3x7x7xf32>) { + %2 = scf.for %arg3 = %c0 to %c3 step %c1 iter_args(%arg4 = %arg2) -> (tensor<64x3x7x7xf32>) { + %3 = scf.for %arg5 = %c0 to %c7 step %c1 iter_args(%arg6 = %arg4) -> (tensor<64x3x7x7xf32>) { + %4 = scf.for %arg7 = %c0 to %c7 step %c1 iter_args(%arg8 = %arg6) -> (tensor<64x3x7x7xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<64x3x7x7xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<64x3x7x7xf32> + scf.yield %inserted_slice : tensor<64x3x7x7xf32> + } + scf.yield %4 : tensor<64x3x7x7xf32> + } + scf.yield %3 : tensor<64x3x7x7xf32> + } + scf.yield %2 : tensor<64x3x7x7xf32> + } return %1 : tensor<64x3x7x7xf32> } - func.func private @Unknown78(%arg0: tensor<1x1000xf16>) -> tensor<1x1000xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown78(%arg0: tensor<1x1000xf16>) -> tensor<1000xf32> attributes {__byteir_elementwise_fusion__} { + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<1x1000xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x1000xf16>) outs(%0 : tensor<1x1000xf32>) { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<1x1000xf32> - return %1 : tensor<1x1000xf32> - } - func.func private @Unknown79(%arg0: tensor<1000xf32>) -> tensor<1000xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<1000xf32> - %1 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel"]} ins(%arg0 : tensor<1000xf32>) outs(%0 : tensor<1000xf32>) { - ^bb0(%in: f32, %out: f32): - %2 = arith.truncf %in : f32 to f16 - %3 = arith.extf %2 : f16 to f32 - linalg.yield %3 : f32 - } 
-> tensor<1000xf32> - return %1 : tensor<1000xf32> - } - func.func private @Unknown80(%arg0: tensor<1000x512xf16>) -> tensor<1000x512xf32> attributes {__byteir_elementwise_fusion__} { + %1 = scf.for %arg1 = %c0 to %c1000 step %c1 iter_args(%arg2 = %0) -> (tensor<1x1000xf32>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg1] [1, 1] [1, 1] : tensor<1x1000xf16> to tensor + %2 = tensor.empty() : tensor + %3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%2 : tensor) { + ^bb0(%in: f16, %out: f32): + %4 = arith.extf %in : f16 to f32 + %5 = arith.truncf %4 : f32 to f16 + %6 = arith.extf %5 : f16 to f32 + linalg.yield %6 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %3 into %arg2[0, %arg1] [1, 1] [1, 1] : tensor into tensor<1x1000xf32> + scf.yield %inserted_slice : tensor<1x1000xf32> + } + %collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<1x1000xf32> into tensor<1000xf32> + return %collapsed : tensor<1000xf32> + } + func.func private @Unknown79(%arg0: tensor<1000x512xf16>) -> tensor<1000x512xf32> attributes {__byteir_elementwise_fusion__} { + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<1000x512xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1000x512xf16>) outs(%0 : tensor<1000x512xf32>) { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<1000x512xf32> + %1 = scf.for %arg1 = %c0 to %c1000 step %c1 iter_args(%arg2 = %0) -> (tensor<1000x512xf32>) { + %2 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %arg2) -> (tensor<1000x512xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3] [1, 1] [1, 1] : tensor<1000x512xf16> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f16, %out: f32): + %5 = arith.extf %in : f16 to f32 + linalg.yield %5 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3] [1, 1] [1, 1] : tensor into tensor<1000x512xf32> + scf.yield %inserted_slice : tensor<1000x512xf32> + } + scf.yield %2 : tensor<1000x512xf32> + } return %1 : tensor<1000x512xf32> } - func.func private @Unknown81(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<64x64x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf16>) outs(%0 : tensor<64x64x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<64x64x3x3xf32> - return %1 : tensor<64x64x3x3xf32> - } - func.func private @Unknown82(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown80(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<64x64x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", 
"parallel"]} ins(%arg0 : tensor<64x64x3x3xf16>) outs(%0 : tensor<64x64x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<64x64x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c64 step %c1 iter_args(%arg2 = %0) -> (tensor<64x64x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %arg2) -> (tensor<64x64x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<64x64x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<64x64x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<64x64x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<64x64x3x3xf32> + scf.yield %inserted_slice : tensor<64x64x3x3xf32> + } + scf.yield %4 : tensor<64x64x3x3xf32> + } + scf.yield %3 : tensor<64x64x3x3xf32> + } + scf.yield %2 : tensor<64x64x3x3xf32> + } return %1 : tensor<64x64x3x3xf32> } - func.func private @Unknown83(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<64x64x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf16>) outs(%0 : tensor<64x64x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<64x64x3x3xf32> - return %1 : tensor<64x64x3x3xf32> - } - func.func private @Unknown84(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<64x64x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf16>) outs(%0 : tensor<64x64x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<64x64x3x3xf32> - return %1 : tensor<64x64x3x3xf32> - } - func.func private @Unknown85(%arg0: tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown84(%arg0: tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<128x64x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x64x3x3xf16>) outs(%0 : tensor<128x64x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<128x64x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<128x64x3x3xf32>) { + %2 = 
scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %arg2) -> (tensor<128x64x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<128x64x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<128x64x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<128x64x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<128x64x3x3xf32> + scf.yield %inserted_slice : tensor<128x64x3x3xf32> + } + scf.yield %4 : tensor<128x64x3x3xf32> + } + scf.yield %3 : tensor<128x64x3x3xf32> + } + scf.yield %2 : tensor<128x64x3x3xf32> + } return %1 : tensor<128x64x3x3xf32> } - func.func private @Unknown86(%arg0: tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown85(%arg0: tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<128x128x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x128x3x3xf16>) outs(%0 : tensor<128x128x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<128x128x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<128x128x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %arg2) -> (tensor<128x128x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<128x128x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<128x128x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<128x128x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<128x128x3x3xf32> + scf.yield %inserted_slice : tensor<128x128x3x3xf32> + } + scf.yield %4 : tensor<128x128x3x3xf32> + } + scf.yield %3 : tensor<128x128x3x3xf32> + } + scf.yield %2 : tensor<128x128x3x3xf32> + } return %1 : tensor<128x128x3x3xf32> } - func.func private @Unknown87(%arg0: tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown86(%arg0: tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<128x64x1x1xf32> - %1 = linalg.generic {indexing_maps = [#map, 
#map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x64x1x1xf16>) outs(%0 : tensor<128x64x1x1xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<128x64x1x1xf32> + %1 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<128x64x1x1xf32>) { + %2 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %arg2) -> (tensor<128x64x1x1xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<128x64x1x1xf16> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f16, %out: f32): + %5 = arith.extf %in : f16 to f32 + linalg.yield %5 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<128x64x1x1xf32> + scf.yield %inserted_slice : tensor<128x64x1x1xf32> + } + scf.yield %2 : tensor<128x64x1x1xf32> + } return %1 : tensor<128x64x1x1xf32> } - func.func private @Unknown88(%arg0: tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<128x128x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x128x3x3xf16>) outs(%0 : tensor<128x128x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<128x128x3x3xf32> - return %1 : tensor<128x128x3x3xf32> - } - func.func private @Unknown89(%arg0: tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<128x128x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x128x3x3xf16>) outs(%0 : tensor<128x128x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<128x128x3x3xf32> - return %1 : tensor<128x128x3x3xf32> - } - func.func private @Unknown90(%arg0: tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown89(%arg0: tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<256x128x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x128x3x3xf16>) outs(%0 : tensor<256x128x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<256x128x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 iter_args(%arg2 = %0) -> (tensor<256x128x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %arg2) -> (tensor<256x128x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> 
(tensor<256x128x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<256x128x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<256x128x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<256x128x3x3xf32> + scf.yield %inserted_slice : tensor<256x128x3x3xf32> + } + scf.yield %4 : tensor<256x128x3x3xf32> + } + scf.yield %3 : tensor<256x128x3x3xf32> + } + scf.yield %2 : tensor<256x128x3x3xf32> + } return %1 : tensor<256x128x3x3xf32> } - func.func private @Unknown91(%arg0: tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown90(%arg0: tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<256x256x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x256x3x3xf16>) outs(%0 : tensor<256x256x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<256x256x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 iter_args(%arg2 = %0) -> (tensor<256x256x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %arg2) -> (tensor<256x256x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<256x256x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<256x256x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<256x256x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<256x256x3x3xf32> + scf.yield %inserted_slice : tensor<256x256x3x3xf32> + } + scf.yield %4 : tensor<256x256x3x3xf32> + } + scf.yield %3 : tensor<256x256x3x3xf32> + } + scf.yield %2 : tensor<256x256x3x3xf32> + } return %1 : tensor<256x256x3x3xf32> } - func.func private @Unknown92(%arg0: tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown91(%arg0: tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<256x128x1x1xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x128x1x1xf16>) outs(%0 : tensor<256x128x1x1xf32>) attrs 
= {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<256x128x1x1xf32> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 iter_args(%arg2 = %0) -> (tensor<256x128x1x1xf32>) { + %2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %arg2) -> (tensor<256x128x1x1xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<256x128x1x1xf16> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f16, %out: f32): + %5 = arith.extf %in : f16 to f32 + linalg.yield %5 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<256x128x1x1xf32> + scf.yield %inserted_slice : tensor<256x128x1x1xf32> + } + scf.yield %2 : tensor<256x128x1x1xf32> + } return %1 : tensor<256x128x1x1xf32> } - func.func private @Unknown93(%arg0: tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<256x256x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x256x3x3xf16>) outs(%0 : tensor<256x256x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<256x256x3x3xf32> - return %1 : tensor<256x256x3x3xf32> - } - func.func private @Unknown94(%arg0: tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<256x256x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x256x3x3xf16>) outs(%0 : tensor<256x256x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<256x256x3x3xf32> - return %1 : tensor<256x256x3x3xf32> - } - func.func private @Unknown95(%arg0: tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown94(%arg0: tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<512x256x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x256x3x3xf16>) outs(%0 : tensor<512x256x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<512x256x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<512x256x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %arg2) -> (tensor<512x256x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<512x256x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<512x256x3x3xf32>) { + %extracted_slice = 
tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<512x256x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<512x256x3x3xf32> + scf.yield %inserted_slice : tensor<512x256x3x3xf32> + } + scf.yield %4 : tensor<512x256x3x3xf32> + } + scf.yield %3 : tensor<512x256x3x3xf32> + } + scf.yield %2 : tensor<512x256x3x3xf32> + } return %1 : tensor<512x256x3x3xf32> } - func.func private @Unknown96(%arg0: tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown95(%arg0: tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<512x512x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x512x3x3xf16>) outs(%0 : tensor<512x512x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<512x512x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<512x512x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %arg2) -> (tensor<512x512x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<512x512x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<512x512x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<512x512x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<512x512x3x3xf32> + scf.yield %inserted_slice : tensor<512x512x3x3xf32> + } + scf.yield %4 : tensor<512x512x3x3xf32> + } + scf.yield %3 : tensor<512x512x3x3xf32> + } + scf.yield %2 : tensor<512x512x3x3xf32> + } return %1 : tensor<512x512x3x3xf32> } - func.func private @Unknown97(%arg0: tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown96(%arg0: tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<512x256x1x1xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x256x1x1xf16>) outs(%0 : tensor<512x256x1x1xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 
- } -> tensor<512x256x1x1xf32> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<512x256x1x1xf32>) { + %2 = scf.for %arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %arg2) -> (tensor<512x256x1x1xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<512x256x1x1xf16> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f16, %out: f32): + %5 = arith.extf %in : f16 to f32 + linalg.yield %5 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<512x256x1x1xf32> + scf.yield %inserted_slice : tensor<512x256x1x1xf32> + } + scf.yield %2 : tensor<512x256x1x1xf32> + } return %1 : tensor<512x256x1x1xf32> } - func.func private @Unknown98(%arg0: tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<512x512x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x512x3x3xf16>) outs(%0 : tensor<512x512x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<512x512x3x3xf32> - return %1 : tensor<512x512x3x3xf32> - } - func.func private @Unknown99(%arg0: tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<512x512x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x512x3x3xf16>) outs(%0 : tensor<512x512x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<512x512x3x3xf32> - return %1 : tensor<512x512x3x3xf32> - } func.func @main(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>, %arg2: tensor<64xf32>, %arg3: tensor<64xf32>, %arg4: tensor<64xf32>, %arg5: tensor<64xf32>, %arg6: tensor<64xf32>, %arg7: tensor<64xf32>, %arg8: tensor<64xf32>, %arg9: tensor<64xf32>, %arg10: tensor<128xf32>, %arg11: tensor<128xf32>, %arg12: tensor<128xf32>, %arg13: tensor<128xf32>, %arg14: tensor<128xf32>, %arg15: tensor<128xf32>, %arg16: tensor<128xf32>, %arg17: tensor<128xf32>, %arg18: tensor<128xf32>, %arg19: tensor<128xf32>, %arg20: tensor<256xf32>, %arg21: tensor<256xf32>, %arg22: tensor<256xf32>, %arg23: tensor<256xf32>, %arg24: tensor<256xf32>, %arg25: tensor<256xf32>, %arg26: tensor<256xf32>, %arg27: tensor<256xf32>, %arg28: tensor<256xf32>, %arg29: tensor<256xf32>, %arg30: tensor<512xf32>, %arg31: tensor<512xf32>, %arg32: tensor<512xf32>, %arg33: tensor<512xf32>, %arg34: tensor<512xf32>, %arg35: tensor<512xf32>, %arg36: tensor<512xf32>, %arg37: tensor<512xf32>, %arg38: tensor<512xf32>, %arg39: tensor<512xf32>, %arg40: tensor<64xf32>, %arg41: tensor<64xf32>, %arg42: tensor<64xf32>, %arg43: tensor<64xf32>, %arg44: tensor<64xf32>, %arg45: tensor<64xf32>, %arg46: tensor<64xf32>, %arg47: tensor<64xf32>, %arg48: tensor<64xf32>, %arg49: tensor<64xf32>, %arg50: tensor<128xf32>, %arg51: tensor<128xf32>, %arg52: tensor<128xf32>, %arg53: tensor<128xf32>, %arg54: tensor<128xf32>, %arg55: tensor<128xf32>, %arg56: tensor<128xf32>, %arg57: tensor<128xf32>, %arg58: 
tensor<128xf32>, %arg59: tensor<128xf32>, %arg60: tensor<256xf32>, %arg61: tensor<256xf32>, %arg62: tensor<256xf32>, %arg63: tensor<256xf32>, %arg64: tensor<256xf32>, %arg65: tensor<256xf32>, %arg66: tensor<256xf32>, %arg67: tensor<256xf32>, %arg68: tensor<256xf32>, %arg69: tensor<256xf32>, %arg70: tensor<512xf32>, %arg71: tensor<512xf32>, %arg72: tensor<512xf32>, %arg73: tensor<512xf32>, %arg74: tensor<512xf32>, %arg75: tensor<512xf32>, %arg76: tensor<512xf32>, %arg77: tensor<512xf32>, %arg78: tensor<512xf32>, %arg79: tensor<512xf32>, %arg80: tensor<64x3x7x7xf16>, %arg81: tensor<1x3x224x224xf16>, %arg82: tensor<1x64x112x112xf16>, %arg83: tensor<1x64x112x112xf16>, %arg84: tensor<1x64x56x56xf16>, %arg85: tensor<64x64x3x3xf16>, %arg86: tensor<1x64x56x56xf16>, %arg87: tensor<1x64x56x56xf16>, %arg88: tensor<64x64x3x3xf16>, %arg89: tensor<1x64x56x56xf16>, %arg90: tensor<1x64x56x56xf16>, %arg91: tensor<64x64x3x3xf16>, %arg92: tensor<1x64x56x56xf16>, %arg93: tensor<1x64x56x56xf16>, %arg94: tensor<64x64x3x3xf16>, %arg95: tensor<1x64x56x56xf16>, %arg96: tensor<1x64x56x56xf16>, %arg97: tensor<128x64x3x3xf16>, %arg98: tensor<1x128x28x28xf16>, %arg99: tensor<1x128x28x28xf16>, %arg100: tensor<128x128x3x3xf16>, %arg101: tensor<1x128x28x28xf16>, %arg102: tensor<128x64x1x1xf16>, %arg103: tensor<1x128x28x28xf16>, %arg104: tensor<1x128x28x28xf16>, %arg105: tensor<128x128x3x3xf16>, %arg106: tensor<1x128x28x28xf16>, %arg107: tensor<1x128x28x28xf16>, %arg108: tensor<128x128x3x3xf16>, %arg109: tensor<1x128x28x28xf16>, %arg110: tensor<1x128x28x28xf16>, %arg111: tensor<256x128x3x3xf16>, %arg112: tensor<1x256x14x14xf16>, %arg113: tensor<1x256x14x14xf16>, %arg114: tensor<256x256x3x3xf16>, %arg115: tensor<1x256x14x14xf16>, %arg116: tensor<256x128x1x1xf16>, %arg117: tensor<1x256x14x14xf16>, %arg118: tensor<1x256x14x14xf16>, %arg119: tensor<256x256x3x3xf16>, %arg120: tensor<1x256x14x14xf16>, %arg121: tensor<1x256x14x14xf16>, %arg122: tensor<256x256x3x3xf16>, %arg123: tensor<1x256x14x14xf16>, %arg124: tensor<1x256x14x14xf16>, %arg125: tensor<512x256x3x3xf16>, %arg126: tensor<1x512x7x7xf16>, %arg127: tensor<1x512x7x7xf16>, %arg128: tensor<512x512x3x3xf16>, %arg129: tensor<1x512x7x7xf16>, %arg130: tensor<512x256x1x1xf16>, %arg131: tensor<1x512x7x7xf16>, %arg132: tensor<1x512x7x7xf16>, %arg133: tensor<512x512x3x3xf16>, %arg134: tensor<1x512x7x7xf16>, %arg135: tensor<1x512x7x7xf16>, %arg136: tensor<512x512x3x3xf16>, %arg137: tensor<1x512x7x7xf16>, %arg138: tensor<1x512x7x7xf16>, %arg139: tensor<1x512xf16>, %arg140: tensor<512x1000xf16>, %arg141: tensor<1x1000xf16>) -> (tensor<64xf32>, tensor<64xf32>, tensor<64x3x7x7xf32>, tensor<1000xf32>, tensor<1000x512xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64x64x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x64x3x3xf32>, tensor<128x128x3x3xf32>, tensor<128x64x1x1xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128x128x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x128x3x3xf32>, tensor<256x256x3x3xf32>, tensor<256x128x1x1xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256x256x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, 
tensor<512x256x3x3xf32>, tensor<512x512x3x3xf32>, tensor<512x256x1x1xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512x512x3x3xf32>) attributes {__placeholder__byre.entry_point} { %0 = tensor.empty() : tensor<1x512xf16> %1 = byre.compute_on_tensor @MatmulOp_f16f16_f16 {lhs_contracting_dimension = 1 : i64, rhs_contracting_dimension = 1 : i64} ins(%arg141, %arg140 : tensor<1x1000xf16>, tensor<512x1000xf16>) outs(%0 : tensor<1x512xf16>) : tensor<1x512xf16> @@ -450,7 +687,7 @@ module { %26 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%24#0, %arg128 : tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) outs(%25 : tensor<1x512x7x7xf16>) : tensor<1x512x7x7xf16> %27 = tensor.empty() : tensor<512x512x3x3xf16> %28 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%arg127, %24#0 : tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) outs(%27 : tensor<512x512x3x3xf16>) : tensor<512x512x3x3xf16> - %29 = call @Unknown12(%arg127, %26) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> + %29 = call @Unknown4(%arg127, %26) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> %30 = tensor.empty() : tensor<1x512x7x7xf16> %31 = tensor.empty() : tensor<512xf32> %32 = tensor.empty() : tensor<512xf32> @@ -485,7 +722,7 @@ module { %61 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%59#0, %arg119 : tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) outs(%60 : tensor<1x256x14x14xf16>) : tensor<1x256x14x14xf16> %62 = tensor.empty() : tensor<256x256x3x3xf16> %63 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%arg118, %59#0 : tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) outs(%62 : tensor<256x256x3x3xf16>) : tensor<256x256x3x3xf16> - %64 = call @Unknown27(%46, %61, %arg118) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> + %64 = call @Unknown19(%46, %61, %arg118) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> %65 = tensor.empty() : tensor<1x256x14x14xf16> %66 = tensor.empty() : tensor<256xf32> %67 = tensor.empty() : tensor<256xf32> @@ -494,7 +731,7 @@ module { %70 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%68#0, %arg114 : tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) outs(%69 : tensor<1x256x14x14xf16>) : tensor<1x256x14x14xf16> %71 = tensor.empty() : 
tensor<256x256x3x3xf16> %72 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%arg113, %68#0 : tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) outs(%71 : tensor<256x256x3x3xf16>) : tensor<256x256x3x3xf16> - %73 = call @Unknown31(%arg113, %70) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> + %73 = call @Unknown23(%arg113, %70) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> %74 = tensor.empty() : tensor<1x256x14x14xf16> %75 = tensor.empty() : tensor<256xf32> %76 = tensor.empty() : tensor<256xf32> @@ -529,7 +766,7 @@ module { %105 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%103#0, %arg105 : tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) outs(%104 : tensor<1x128x28x28xf16>) : tensor<1x128x28x28xf16> %106 = tensor.empty() : tensor<128x128x3x3xf16> %107 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%arg104, %103#0 : tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) outs(%106 : tensor<128x128x3x3xf16>) : tensor<128x128x3x3xf16> - %108 = call @Unknown46(%90, %105, %arg104) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> + %108 = call @Unknown38(%90, %105, %arg104) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> %109 = tensor.empty() : tensor<1x128x28x28xf16> %110 = tensor.empty() : tensor<128xf32> %111 = tensor.empty() : tensor<128xf32> @@ -538,7 +775,7 @@ module { %114 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%112#0, %arg100 : tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) outs(%113 : tensor<1x128x28x28xf16>) : tensor<1x128x28x28xf16> %115 = tensor.empty() : tensor<128x128x3x3xf16> %116 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%arg99, %112#0 : tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) outs(%115 : tensor<128x128x3x3xf16>) : tensor<128x128x3x3xf16> - %117 = call @Unknown50(%arg99, %114) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> + %117 = call @Unknown42(%arg99, %114) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> %118 = tensor.empty() : tensor<1x128x28x28xf16> %119 = tensor.empty() : tensor<128xf32> %120 = tensor.empty() : tensor<128xf32> @@ -573,7 +810,7 @@ module { %149 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", 
kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%147#0, %arg91 : tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) outs(%148 : tensor<1x64x56x56xf16>) : tensor<1x64x56x56xf16> %150 = tensor.empty() : tensor<64x64x3x3xf16> %151 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%arg90, %147#0 : tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) outs(%150 : tensor<64x64x3x3xf16>) : tensor<64x64x3x3xf16> - %152 = call @Unknown65(%134, %149, %arg90) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> + %152 = call @Unknown57(%134, %149, %arg90) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> %153 = tensor.empty() : tensor<1x64x56x56xf16> %154 = tensor.empty() : tensor<64xf32> %155 = tensor.empty() : tensor<64xf32> @@ -582,7 +819,7 @@ module { %158 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%156#0, %arg88 : tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) outs(%157 : tensor<1x64x56x56xf16>) : tensor<1x64x56x56xf16> %159 = tensor.empty() : tensor<64x64x3x3xf16> %160 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%arg87, %156#0 : tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) outs(%159 : tensor<64x64x3x3xf16>) : tensor<64x64x3x3xf16> - %161 = call @Unknown69(%arg87, %158) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> + %161 = call @Unknown61(%arg87, %158) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> %162 = tensor.empty() : tensor<1x64x56x56xf16> %163 = tensor.empty() : tensor<64xf32> %164 = tensor.empty() : tensor<64xf32> @@ -602,34 +839,31 @@ module { %178 = tensor.empty() : tensor<64x3x7x7xf16> %179 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%arg81, %177#0 : tensor<1x3x224x224xf16>, tensor<1x64x112x112xf16>) outs(%178 : tensor<64x3x7x7xf16>) : tensor<64x3x7x7xf16> %180 = call @Unknown77(%179) : (tensor<64x3x7x7xf16>) -> tensor<64x3x7x7xf32> - %181 = call @Unknown78(%arg141) : (tensor<1x1000xf16>) -> tensor<1x1000xf32> - %182 = tensor.empty() : tensor<1000xf32> - %183 = byre.compute_on_tensor @ReduceSumOp_f32_f32 {dimensions = dense<0> : tensor<1xi64>} ins(%181 : tensor<1x1000xf32>) outs(%182 : tensor<1000xf32>) : tensor<1000xf32> - %184 = call @Unknown79(%183) : (tensor<1000xf32>) -> tensor<1000xf32> + %181 = call @Unknown78(%arg141) : (tensor<1x1000xf16>) -> tensor<1000xf32> %collapsed = tensor.collapse_shape %arg141 [[0, 1]] : tensor<1x1000xf16> into tensor<1000xf16> %expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<1000xf16> into tensor<1000x1xf16> - %185 
= tensor.empty() : tensor<1000x512xf16> - %186 = byre.compute_on_tensor @MatmulOp_f16f16_f16 {lhs_contracting_dimension = 1 : i64, rhs_contracting_dimension = 0 : i64} ins(%expanded, %arg139 : tensor<1000x1xf16>, tensor<1x512xf16>) outs(%185 : tensor<1000x512xf16>) : tensor<1000x512xf16> - %187 = call @Unknown80(%186) : (tensor<1000x512xf16>) -> tensor<1000x512xf32> - %188 = call @Unknown81(%169) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - %189 = call @Unknown82(%160) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - %190 = call @Unknown83(%151) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - %191 = call @Unknown84(%142) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - %192 = call @Unknown85(%125) : (tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> - %193 = call @Unknown86(%116) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> - %194 = call @Unknown87(%133) : (tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> - %195 = call @Unknown88(%107) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> - %196 = call @Unknown89(%98) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> - %197 = call @Unknown90(%81) : (tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> - %198 = call @Unknown91(%72) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> - %199 = call @Unknown92(%89) : (tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> - %200 = call @Unknown93(%63) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> - %201 = call @Unknown94(%54) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> - %202 = call @Unknown95(%37) : (tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> - %203 = call @Unknown96(%28) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> - %204 = call @Unknown97(%45) : (tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> - %205 = call @Unknown98(%19) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> - %206 = call @Unknown99(%10) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> - return %177#2, %177#1, %180, %184, %187, %165#2, %165#1, %156#2, %156#1, %188, %189, %147#2, %147#1, %138#2, %138#1, %190, %191, %121#2, %121#1, %112#2, %112#1, %192, %193, %194, %129#2, %129#1, %103#2, %103#1, %94#2, %94#1, %195, %196, %77#2, %77#1, %68#2, %68#1, %197, %198, %199, %85#2, %85#1, %59#2, %59#1, %50#2, %50#1, %200, %201, %33#2, %33#1, %24#2, %24#1, %202, %203, %204, %41#2, %41#1, %15#2, %15#1, %6#2, %6#1, %205, %206 : tensor<64xf32>, tensor<64xf32>, tensor<64x3x7x7xf32>, tensor<1000xf32>, tensor<1000x512xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64x64x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x64x3x3xf32>, tensor<128x128x3x3xf32>, tensor<128x64x1x1xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128x128x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x128x3x3xf32>, tensor<256x256x3x3xf32>, tensor<256x128x1x1xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256x256x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x256x3x3xf32>, tensor<512x512x3x3xf32>, tensor<512x256x1x1xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, 
tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512x512x3x3xf32> - } -} + %182 = tensor.empty() : tensor<1000x512xf16> + %183 = byre.compute_on_tensor @MatmulOp_f16f16_f16 {lhs_contracting_dimension = 1 : i64, rhs_contracting_dimension = 0 : i64} ins(%expanded, %arg139 : tensor<1000x1xf16>, tensor<1x512xf16>) outs(%182 : tensor<1000x512xf16>) : tensor<1000x512xf16> + %184 = call @Unknown79(%183) : (tensor<1000x512xf16>) -> tensor<1000x512xf32> + %185 = call @Unknown80(%169) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> + %186 = call @Unknown80(%160) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> + %187 = call @Unknown80(%151) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> + %188 = call @Unknown80(%142) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> + %189 = call @Unknown84(%125) : (tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> + %190 = call @Unknown85(%116) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> + %191 = call @Unknown86(%133) : (tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> + %192 = call @Unknown85(%107) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> + %193 = call @Unknown85(%98) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> + %194 = call @Unknown89(%81) : (tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> + %195 = call @Unknown90(%72) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> + %196 = call @Unknown91(%89) : (tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> + %197 = call @Unknown90(%63) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> + %198 = call @Unknown90(%54) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> + %199 = call @Unknown94(%37) : (tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> + %200 = call @Unknown95(%28) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> + %201 = call @Unknown96(%45) : (tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> + %202 = call @Unknown95(%19) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> + %203 = call @Unknown95(%10) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> + return %177#2, %177#1, %180, %181, %184, %165#2, %165#1, %156#2, %156#1, %185, %186, %147#2, %147#1, %138#2, %138#1, %187, %188, %121#2, %121#1, %112#2, %112#1, %189, %190, %191, %129#2, %129#1, %103#2, %103#1, %94#2, %94#1, %192, %193, %77#2, %77#1, %68#2, %68#1, %194, %195, %196, %85#2, %85#1, %59#2, %59#1, %50#2, %50#1, %197, %198, %33#2, %33#1, %24#2, %24#1, %199, %200, %201, %41#2, %41#1, %15#2, %15#1, %6#2, %6#1, %202, %203 : tensor<64xf32>, tensor<64xf32>, tensor<64x3x7x7xf32>, tensor<1000xf32>, tensor<1000x512xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64x64x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x64x3x3xf32>, tensor<128x128x3x3xf32>, tensor<128x64x1x1xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128x128x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x128x3x3xf32>, tensor<256x256x3x3xf32>, tensor<256x128x1x1xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256x256x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x256x3x3xf32>, tensor<512x512x3x3xf32>, tensor<512x256x1x1xf32>, tensor<512xf32>, 
tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512x512x3x3xf32>
+  }
+}
\ No newline at end of file
diff --git a/compiler/test/E2E/ResNet18/BW/5_affine_opt.mlir b/compiler/test/E2E/ResNet18/BW/5_affine_opt.mlir
index de9aece87..f7d58879d 100644
--- a/compiler/test/E2E/ResNet18/BW/5_affine_opt.mlir
+++ b/compiler/test/E2E/ResNet18/BW/5_affine_opt.mlir
@@ -2,424 +2,563 @@
 // CHECK-LABEL: func.func @main
-#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
-#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
-#map2 = affine_map<(d0, d1) -> (d0, d1)>
-#map3 = affine_map<(d0) -> (d0)>
+#map = affine_map<() -> ()>
 module {
   func.func private @Unknown0(%arg0: memref<1x512xf16>, %arg1: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} {
     %cst = arith.constant 4.900000e+01 : f16
     %cst_0 = arith.constant 0.000000e+00 : f16
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c512 = arith.constant 512 : index
+    %c7 = arith.constant 7 : index
     %alloc = memref.alloc() : memref<1x512x7x7xf16>
-    linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg1, %arg0 : memref<1x512x7x7xf16>, memref<1x512xf16>) outs(%alloc : memref<1x512x7x7xf16>) {
-    ^bb0(%in: f16, %in_1: f16, %out: f16):
-      %0 = arith.divf %in_1, %cst : f16
-      %1 = arith.cmpf ogt, %in, %cst_0 : f16
-      %2 = arith.select %1, %0, %cst_0 : f16
-      linalg.yield %2 : f16
+    scf.for %arg2 = %c0 to %c512 step %c1 {
+      scf.for %arg3 = %c0 to %c7 step %c1 {
+        scf.for %arg4 = %c0 to %c7 step %c1 {
+          %subview = memref.subview %arg0[0, %arg2] [1, 1] [1, 1] : memref<1x512xf16> to memref<f16, strided<[], offset: ?>>
+          %subview_1 = memref.subview %alloc[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref<f16, strided<[], offset: ?>>
+          %subview_2 = memref.subview %arg1[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref<f16, strided<[], offset: ?>>
+          linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_2 : memref<f16, strided<[], offset: ?>>, memref<f16, strided<[], offset: ?>>) outs(%subview_1 : memref<f16, strided<[], offset: ?>>) {
+          ^bb0(%in: f16, %in_3: f16, %out: f16):
+            %0 = arith.divf %in, %cst : f16
+            %1 = arith.cmpf ogt, %in_3, %cst_0 : f16
+            %2 = arith.select %1, %0, %cst_0 : f16
+            linalg.yield %2 : f16
+          }
+        }
+      }
     }
     return %alloc : memref<1x512x7x7xf16>
   }
   func.func private @Unknown4(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} {
     %cst = arith.constant 0.000000e+00 : f16
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c512 = arith.constant 512 : index
+    %c7 = arith.constant 7 : index
     %alloc = memref.alloc() : memref<1x512x7x7xf16>
-    linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) outs(%alloc : memref<1x512x7x7xf16>) {
-    ^bb0(%in: f16, %in_0: f16, %out: f16):
-      %0 = arith.cmpf ogt, %in, %cst : f16
-      %1 = arith.select %0, %in_0, %cst : f16
-      linalg.yield %1 : f16
+    scf.for %arg2 = %c0 to %c512 step %c1 {
+      scf.for %arg3 = %c0 to %c7 step %c1 {
+        scf.for %arg4 = %c0 to %c7 step %c1 {
+          %subview = memref.subview %arg0[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref<f16, strided<[], offset: ?>>
+          %subview_0 = memref.subview %alloc[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref<f16, strided<[], offset: ?>>
+          %subview_1 = memref.subview %arg1[0, %arg2, %arg3, %arg4] [1, 1, 1, 1]
[1, 1, 1, 1] : memref<1x512x7x7xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.cmpf ogt, %in, %cst : f16 + %1 = arith.select %0, %in_2, %cst : f16 + linalg.yield %1 : f16 + } + } + } } return %alloc : memref<1x512x7x7xf16> } func.func private @Unknown8(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %alloc = memref.alloc() : memref<1x512x7x7xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) outs(%alloc : memref<1x512x7x7xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.cmpf ogt, %in, %cst : f16 - %2 = arith.select %1, %0, %cst : f16 - linalg.yield %2 : f16 - } - return %alloc : memref<1x512x7x7xf16> - } - func.func private @Unknown12(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x512x7x7xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) outs(%alloc : memref<1x512x7x7xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.cmpf ogt, %in, %cst : f16 - %1 = arith.select %0, %in_0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg3 = %c0 to %c512 step %c1 { + scf.for %arg4 = %c0 to %c7 step %c1 { + scf.for %arg5 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref> + %subview_2 = memref.subview %arg2[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_1, %subview_2 : memref>, memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_3: f16, %in_4: f16, %out: f16): + %0 = arith.addf %in, %in_3 : f16 + %1 = arith.cmpf ogt, %in_4, %cst : f16 + %2 = arith.select %1, %0, %cst : f16 + linalg.yield %2 : f16 + } + } + } } return %alloc : memref<1x512x7x7xf16> } func.func private @Unknown19(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %alloc = memref.alloc() : memref<1x256x14x14xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : 
memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) outs(%alloc : memref<1x256x14x14xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.cmpf ogt, %in, %cst : f16 - %2 = arith.select %1, %0, %cst : f16 - linalg.yield %2 : f16 + scf.for %arg3 = %c0 to %c256 step %c1 { + scf.for %arg4 = %c0 to %c14 step %c1 { + scf.for %arg5 = %c0 to %c14 step %c1 { + %subview = memref.subview %arg0[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x256x14x14xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x256x14x14xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x256x14x14xf16> to memref> + %subview_2 = memref.subview %arg2[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x256x14x14xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_1, %subview_2 : memref>, memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_3: f16, %in_4: f16, %out: f16): + %0 = arith.addf %in, %in_3 : f16 + %1 = arith.cmpf ogt, %in_4, %cst : f16 + %2 = arith.select %1, %0, %cst : f16 + linalg.yield %2 : f16 + } + } + } } return %alloc : memref<1x256x14x14xf16> } func.func private @Unknown23(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %alloc = memref.alloc() : memref<1x256x14x14xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) outs(%alloc : memref<1x256x14x14xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.cmpf ogt, %in, %cst : f16 - %1 = arith.select %0, %in_0, %cst : f16 - linalg.yield %1 : f16 - } - return %alloc : memref<1x256x14x14xf16> - } - func.func private @Unknown27(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x256x14x14xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) outs(%alloc : memref<1x256x14x14xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.cmpf ogt, %in, %cst : f16 - %2 = arith.select %1, %0, %cst : f16 - linalg.yield %2 : f16 - } - return %alloc : memref<1x256x14x14xf16> - } - func.func private @Unknown31(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x256x14x14xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) outs(%alloc : memref<1x256x14x14xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.cmpf ogt, %in, %cst : f16 - %1 = 
arith.select %0, %in_0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg2 = %c0 to %c256 step %c1 { + scf.for %arg3 = %c0 to %c14 step %c1 { + scf.for %arg4 = %c0 to %c14 step %c1 { + %subview = memref.subview %arg0[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x256x14x14xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x256x14x14xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x256x14x14xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.cmpf ogt, %in, %cst : f16 + %1 = arith.select %0, %in_2, %cst : f16 + linalg.yield %1 : f16 + } + } + } } return %alloc : memref<1x256x14x14xf16> } func.func private @Unknown38(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %alloc = memref.alloc() : memref<1x128x28x28xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) outs(%alloc : memref<1x128x28x28xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.cmpf ogt, %in, %cst : f16 - %2 = arith.select %1, %0, %cst : f16 - linalg.yield %2 : f16 + scf.for %arg3 = %c0 to %c128 step %c1 { + scf.for %arg4 = %c0 to %c28 step %c1 { + scf.for %arg5 = %c0 to %c28 step %c1 { + %subview = memref.subview %arg0[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x128x28x28xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x128x28x28xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x128x28x28xf16> to memref> + %subview_2 = memref.subview %arg2[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x128x28x28xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_1, %subview_2 : memref>, memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_3: f16, %in_4: f16, %out: f16): + %0 = arith.addf %in, %in_3 : f16 + %1 = arith.cmpf ogt, %in_4, %cst : f16 + %2 = arith.select %1, %0, %cst : f16 + linalg.yield %2 : f16 + } + } + } } return %alloc : memref<1x128x28x28xf16> } func.func private @Unknown42(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %alloc = memref.alloc() : memref<1x128x28x28xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) outs(%alloc : memref<1x128x28x28xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.cmpf ogt, %in, %cst : f16 - %1 = arith.select 
%0, %in_0, %cst : f16 - linalg.yield %1 : f16 - } - return %alloc : memref<1x128x28x28xf16> - } - func.func private @Unknown46(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x128x28x28xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) outs(%alloc : memref<1x128x28x28xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.cmpf ogt, %in, %cst : f16 - %2 = arith.select %1, %0, %cst : f16 - linalg.yield %2 : f16 - } - return %alloc : memref<1x128x28x28xf16> - } - func.func private @Unknown50(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x128x28x28xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) outs(%alloc : memref<1x128x28x28xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.cmpf ogt, %in, %cst : f16 - %1 = arith.select %0, %in_0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg2 = %c0 to %c128 step %c1 { + scf.for %arg3 = %c0 to %c28 step %c1 { + scf.for %arg4 = %c0 to %c28 step %c1 { + %subview = memref.subview %arg0[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x128x28x28xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x128x28x28xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x128x28x28xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.cmpf ogt, %in, %cst : f16 + %1 = arith.select %0, %in_2, %cst : f16 + linalg.yield %1 : f16 + } + } + } } return %alloc : memref<1x128x28x28xf16> } func.func private @Unknown57(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %alloc = memref.alloc() : memref<1x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) outs(%alloc : memref<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.cmpf ogt, %in, %cst : f16 - %2 = arith.select %1, %0, %cst : f16 - linalg.yield %2 : f16 + scf.for %arg3 = %c0 to %c64 step %c1 { + scf.for %arg4 = %c0 to %c56 step %c1 { + scf.for %arg5 = %c0 to %c56 step %c1 { + %subview = memref.subview %arg0[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg3, 
%arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + %subview_2 = memref.subview %arg2[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_1, %subview_2 : memref>, memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_3: f16, %in_4: f16, %out: f16): + %0 = arith.addf %in, %in_3 : f16 + %1 = arith.cmpf ogt, %in_4, %cst : f16 + %2 = arith.select %1, %0, %cst : f16 + linalg.yield %2 : f16 + } + } + } } return %alloc : memref<1x64x56x56xf16> } func.func private @Unknown61(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %alloc = memref.alloc() : memref<1x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) outs(%alloc : memref<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.cmpf ogt, %in, %cst : f16 - %1 = arith.select %0, %in_0, %cst : f16 - linalg.yield %1 : f16 - } - return %alloc : memref<1x64x56x56xf16> - } - func.func private @Unknown65(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) outs(%alloc : memref<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.cmpf ogt, %in, %cst : f16 - %2 = arith.select %1, %0, %cst : f16 - linalg.yield %2 : f16 - } - return %alloc : memref<1x64x56x56xf16> - } - func.func private @Unknown69(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) outs(%alloc : memref<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.cmpf ogt, %in, %cst : f16 - %1 = arith.select %0, %in_0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c56 step %c1 { + scf.for %arg4 = %c0 to %c56 step %c1 { + %subview = memref.subview %arg0[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} 
ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.cmpf ogt, %in, %cst : f16 + %1 = arith.select %0, %in_2, %cst : f16 + linalg.yield %1 : f16 + } + } + } } return %alloc : memref<1x64x56x56xf16> } func.func private @Unknown73(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %alloc = memref.alloc() : memref<1x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) outs(%alloc : memref<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.addf %in, %in_0 : f16 - linalg.yield %0 : f16 + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c56 step %c1 { + scf.for %arg4 = %c0 to %c56 step %c1 { + %subview = memref.subview %arg0[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.addf %in, %in_2 : f16 + linalg.yield %0 : f16 + } + } + } } return %alloc : memref<1x64x56x56xf16> } func.func private @Unknown74(%arg0: memref<1x64x112x112xf16>, %arg1: memref<1x64x112x112xf16>) -> memref<1x64x112x112xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c112 = arith.constant 112 : index %alloc = memref.alloc() : memref<1x64x112x112xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x64x112x112xf16>, memref<1x64x112x112xf16>) outs(%alloc : memref<1x64x112x112xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.cmpf ogt, %in, %cst : f16 - %1 = arith.select %0, %in_0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c112 step %c1 { + scf.for %arg4 = %c0 to %c112 step %c1 { + %subview = memref.subview %arg0[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x112x112xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x112x112xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x112x112xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.cmpf ogt, %in, %cst : f16 + %1 = arith.select %0, %in_2, %cst : f16 + linalg.yield %1 : f16 + } + } + } } return %alloc : memref<1x64x112x112xf16> } func.func private @Unknown77(%arg0: memref<64x3x7x7xf16>) -> memref<64x3x7x7xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 
: index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + %c7 = arith.constant 7 : index %alloc = memref.alloc() : memref<64x3x7x7xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x3x7x7xf16>) outs(%alloc : memref<64x3x7x7xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c64 step %c1 { + scf.for %arg2 = %c0 to %c3 step %c1 { + scf.for %arg3 = %c0 to %c7 step %c1 { + scf.for %arg4 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x3x7x7xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x3x7x7xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<64x3x7x7xf32> } - func.func private @Unknown78(%arg0: memref<1x1000xf16>) -> memref<1x1000xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown78(%arg0: memref<1x1000xf16>) -> memref<1000xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index %alloc = memref.alloc() : memref<1x1000xf32> - linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%arg0 : memref<1x1000xf16>) outs(%alloc : memref<1x1000xf32>) { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<1x1000xf32> - } - func.func private @Unknown79(%arg0: memref<1000xf32>) -> memref<1000xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<1000xf32> - linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel"]} ins(%arg0 : memref<1000xf32>) outs(%alloc : memref<1000xf32>) { - ^bb0(%in: f32, %out: f32): - %0 = arith.truncf %in : f32 to f16 - %1 = arith.extf %0 : f16 to f32 - linalg.yield %1 : f32 - } - return %alloc : memref<1000xf32> - } - func.func private @Unknown80(%arg0: memref<1000x512xf16>) -> memref<1000x512xf32> attributes {__byteir_elementwise_fusion__} { + scf.for %arg1 = %c0 to %c1000 step %c1 { + %subview = memref.subview %arg0[0, %arg1] [1, 1] [1, 1] : memref<1x1000xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg1] [1, 1] [1, 1] : memref<1x1000xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + %1 = arith.truncf %0 : f32 to f16 + %2 = arith.extf %1 : f16 to f32 + linalg.yield %2 : f32 + } + } + %collapse_shape = memref.collapse_shape %alloc [[0, 1]] : memref<1x1000xf32> into memref<1000xf32> + return %collapse_shape : memref<1000xf32> + } + func.func private @Unknown79(%arg0: memref<1000x512xf16>) -> memref<1000x512xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index %alloc = memref.alloc() : memref<1000x512xf32> - linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%arg0 : 
memref<1000x512xf16>) outs(%alloc : memref<1000x512xf32>) { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c1000 step %c1 { + scf.for %arg2 = %c0 to %c512 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2] [1, 1] [1, 1] : memref<1000x512xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2] [1, 1] [1, 1] : memref<1000x512xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } } return %alloc : memref<1000x512xf32> } - func.func private @Unknown81(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<64x64x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf16>) outs(%alloc : memref<64x64x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<64x64x3x3xf32> - } - func.func private @Unknown82(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown80(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<64x64x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf16>) outs(%alloc : memref<64x64x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c64 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x64x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x64x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<64x64x3x3xf32> } - func.func private @Unknown83(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<64x64x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf16>) outs(%alloc : memref<64x64x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<64x64x3x3xf32> - } - func.func private @Unknown84(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<64x64x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", 
"parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf16>) outs(%alloc : memref<64x64x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<64x64x3x3xf32> - } - func.func private @Unknown85(%arg0: memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown84(%arg0: memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<128x64x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x64x3x3xf16>) outs(%alloc : memref<128x64x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<128x64x3x3xf32> } - func.func private @Unknown86(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown85(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<128x128x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x128x3x3xf16>) outs(%alloc : memref<128x128x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c128 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x128x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x128x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<128x128x3x3xf32> } - func.func private @Unknown87(%arg0: memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown86(%arg0: memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> attributes 
{__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index %alloc = memref.alloc() : memref<128x64x1x1xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x64x1x1xf16>) outs(%alloc : memref<128x64x1x1xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x1x1xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x1x1xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } } return %alloc : memref<128x64x1x1xf32> } - func.func private @Unknown88(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<128x128x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x128x3x3xf16>) outs(%alloc : memref<128x128x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<128x128x3x3xf32> - } - func.func private @Unknown89(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<128x128x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x128x3x3xf16>) outs(%alloc : memref<128x128x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<128x128x3x3xf32> - } - func.func private @Unknown90(%arg0: memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown89(%arg0: memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<256x128x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x128x3x3xf16>) outs(%alloc : memref<256x128x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c128 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x3x3xf32> 
to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<256x128x3x3xf32> } - func.func private @Unknown91(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown90(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<256x256x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x256x3x3xf16>) outs(%alloc : memref<256x256x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x256x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x256x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<256x256x3x3xf32> } - func.func private @Unknown92(%arg0: memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown91(%arg0: memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index %alloc = memref.alloc() : memref<256x128x1x1xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x128x1x1xf16>) outs(%alloc : memref<256x128x1x1xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c128 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x1x1xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x1x1xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } } return %alloc : memref<256x128x1x1xf32> } - func.func private @Unknown93(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<256x256x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x256x3x3xf16>) outs(%alloc : memref<256x256x3x3xf32>) attrs = 
{minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<256x256x3x3xf32> - } - func.func private @Unknown94(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<256x256x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x256x3x3xf16>) outs(%alloc : memref<256x256x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<256x256x3x3xf32> - } - func.func private @Unknown95(%arg0: memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown94(%arg0: memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<512x256x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x256x3x3xf16>) outs(%alloc : memref<512x256x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<512x256x3x3xf32> } - func.func private @Unknown96(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown95(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<512x512x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x512x3x3xf16>) outs(%alloc : memref<512x512x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c512 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x512x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x512x3x3xf32> to memref> + linalg.generic 
{indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<512x512x3x3xf32> } - func.func private @Unknown97(%arg0: memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown96(%arg0: memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<512x256x1x1xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x256x1x1xf16>) outs(%alloc : memref<512x256x1x1xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x1x1xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x1x1xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } } return %alloc : memref<512x256x1x1xf32> } - func.func private @Unknown98(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<512x512x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x512x3x3xf16>) outs(%alloc : memref<512x512x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<512x512x3x3xf32> - } - func.func private @Unknown99(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<512x512x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x512x3x3xf16>) outs(%alloc : memref<512x512x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<512x512x3x3xf32> - } func.func @main(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>, %arg3: memref<64xf32>, %arg4: memref<64xf32>, %arg5: memref<64xf32>, %arg6: memref<64xf32>, %arg7: memref<64xf32>, %arg8: memref<64xf32>, %arg9: memref<64xf32>, %arg10: memref<128xf32>, %arg11: memref<128xf32>, %arg12: memref<128xf32>, %arg13: memref<128xf32>, %arg14: memref<128xf32>, %arg15: memref<128xf32>, %arg16: memref<128xf32>, %arg17: memref<128xf32>, %arg18: memref<128xf32>, %arg19: memref<128xf32>, %arg20: memref<256xf32>, %arg21: memref<256xf32>, %arg22: memref<256xf32>, %arg23: memref<256xf32>, %arg24: memref<256xf32>, %arg25: memref<256xf32>, %arg26: memref<256xf32>, %arg27: memref<256xf32>, %arg28: memref<256xf32>, %arg29: memref<256xf32>, 
%arg30: memref<512xf32>, %arg31: memref<512xf32>, %arg32: memref<512xf32>, %arg33: memref<512xf32>, %arg34: memref<512xf32>, %arg35: memref<512xf32>, %arg36: memref<512xf32>, %arg37: memref<512xf32>, %arg38: memref<512xf32>, %arg39: memref<512xf32>, %arg40: memref<64xf32>, %arg41: memref<64xf32>, %arg42: memref<64xf32>, %arg43: memref<64xf32>, %arg44: memref<64xf32>, %arg45: memref<64xf32>, %arg46: memref<64xf32>, %arg47: memref<64xf32>, %arg48: memref<64xf32>, %arg49: memref<64xf32>, %arg50: memref<128xf32>, %arg51: memref<128xf32>, %arg52: memref<128xf32>, %arg53: memref<128xf32>, %arg54: memref<128xf32>, %arg55: memref<128xf32>, %arg56: memref<128xf32>, %arg57: memref<128xf32>, %arg58: memref<128xf32>, %arg59: memref<128xf32>, %arg60: memref<256xf32>, %arg61: memref<256xf32>, %arg62: memref<256xf32>, %arg63: memref<256xf32>, %arg64: memref<256xf32>, %arg65: memref<256xf32>, %arg66: memref<256xf32>, %arg67: memref<256xf32>, %arg68: memref<256xf32>, %arg69: memref<256xf32>, %arg70: memref<512xf32>, %arg71: memref<512xf32>, %arg72: memref<512xf32>, %arg73: memref<512xf32>, %arg74: memref<512xf32>, %arg75: memref<512xf32>, %arg76: memref<512xf32>, %arg77: memref<512xf32>, %arg78: memref<512xf32>, %arg79: memref<512xf32>, %arg80: memref<64x3x7x7xf16>, %arg81: memref<1x3x224x224xf16>, %arg82: memref<1x64x112x112xf16>, %arg83: memref<1x64x112x112xf16>, %arg84: memref<1x64x56x56xf16>, %arg85: memref<64x64x3x3xf16>, %arg86: memref<1x64x56x56xf16>, %arg87: memref<1x64x56x56xf16>, %arg88: memref<64x64x3x3xf16>, %arg89: memref<1x64x56x56xf16>, %arg90: memref<1x64x56x56xf16>, %arg91: memref<64x64x3x3xf16>, %arg92: memref<1x64x56x56xf16>, %arg93: memref<1x64x56x56xf16>, %arg94: memref<64x64x3x3xf16>, %arg95: memref<1x64x56x56xf16>, %arg96: memref<1x64x56x56xf16>, %arg97: memref<128x64x3x3xf16>, %arg98: memref<1x128x28x28xf16>, %arg99: memref<1x128x28x28xf16>, %arg100: memref<128x128x3x3xf16>, %arg101: memref<1x128x28x28xf16>, %arg102: memref<128x64x1x1xf16>, %arg103: memref<1x128x28x28xf16>, %arg104: memref<1x128x28x28xf16>, %arg105: memref<128x128x3x3xf16>, %arg106: memref<1x128x28x28xf16>, %arg107: memref<1x128x28x28xf16>, %arg108: memref<128x128x3x3xf16>, %arg109: memref<1x128x28x28xf16>, %arg110: memref<1x128x28x28xf16>, %arg111: memref<256x128x3x3xf16>, %arg112: memref<1x256x14x14xf16>, %arg113: memref<1x256x14x14xf16>, %arg114: memref<256x256x3x3xf16>, %arg115: memref<1x256x14x14xf16>, %arg116: memref<256x128x1x1xf16>, %arg117: memref<1x256x14x14xf16>, %arg118: memref<1x256x14x14xf16>, %arg119: memref<256x256x3x3xf16>, %arg120: memref<1x256x14x14xf16>, %arg121: memref<1x256x14x14xf16>, %arg122: memref<256x256x3x3xf16>, %arg123: memref<1x256x14x14xf16>, %arg124: memref<1x256x14x14xf16>, %arg125: memref<512x256x3x3xf16>, %arg126: memref<1x512x7x7xf16>, %arg127: memref<1x512x7x7xf16>, %arg128: memref<512x512x3x3xf16>, %arg129: memref<1x512x7x7xf16>, %arg130: memref<512x256x1x1xf16>, %arg131: memref<1x512x7x7xf16>, %arg132: memref<1x512x7x7xf16>, %arg133: memref<512x512x3x3xf16>, %arg134: memref<1x512x7x7xf16>, %arg135: memref<1x512x7x7xf16>, %arg136: memref<512x512x3x3xf16>, %arg137: memref<1x512x7x7xf16>, %arg138: memref<1x512x7x7xf16>, %arg139: memref<1x512xf16>, %arg140: memref<512x1000xf16>, %arg141: memref<1x1000xf16>) -> (memref<64xf32>, memref<64xf32>, memref<64x3x7x7xf32>, memref<1000xf32>, memref<1000x512xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, 
memref<64x64x3x3xf32>, memref<64x64x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128x64x3x3xf32>, memref<128x128x3x3xf32>, memref<128x64x1x1xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128x128x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256x128x3x3xf32>, memref<256x256x3x3xf32>, memref<256x128x1x1xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256x256x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512x256x3x3xf32>, memref<512x512x3x3xf32>, memref<512x256x1x1xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512x512x3x3xf32>) attributes {__placeholder__byre.entry_point} { %alloc = memref.alloc() : memref<1x512xf16> byre.compute @MatmulOp_f16f16_f16(%arg141, %arg140, %alloc) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<1x1000xf16>, memref<512x1000xf16>, memref<1x512xf16> @@ -450,7 +589,7 @@ module { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_10, %arg128, %alloc_13) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16> %alloc_14 = memref.alloc() : memref<512x512x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg127, %alloc_10, %alloc_14) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16> - %3 = call @Unknown12(%arg127, %alloc_13) : (memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> + %3 = call @Unknown4(%arg127, %alloc_13) : (memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> %alloc_15 = memref.alloc() : memref<1x512x7x7xf16> %alloc_16 = memref.alloc() : memref<512xf32> %alloc_17 = memref.alloc() : memref<512xf32> @@ -485,7 +624,7 @@ module { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_30, %arg119, %alloc_33) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16> %alloc_34 = memref.alloc() : memref<256x256x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg118, %alloc_30, %alloc_34) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16> - %6 = call @Unknown27(%4, %alloc_33, %arg118) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) 
-> memref<1x256x14x14xf16> + %6 = call @Unknown19(%4, %alloc_33, %arg118) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> %alloc_35 = memref.alloc() : memref<1x256x14x14xf16> %alloc_36 = memref.alloc() : memref<256xf32> %alloc_37 = memref.alloc() : memref<256xf32> @@ -494,7 +633,7 @@ module { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_35, %arg114, %alloc_38) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16> %alloc_39 = memref.alloc() : memref<256x256x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg113, %alloc_35, %alloc_39) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16> - %7 = call @Unknown31(%arg113, %alloc_38) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> + %7 = call @Unknown23(%arg113, %alloc_38) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> %alloc_40 = memref.alloc() : memref<1x256x14x14xf16> %alloc_41 = memref.alloc() : memref<256xf32> %alloc_42 = memref.alloc() : memref<256xf32> @@ -529,7 +668,7 @@ module { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_55, %arg105, %alloc_58) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16> %alloc_59 = memref.alloc() : memref<128x128x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg104, %alloc_55, %alloc_59) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16> - %10 = call @Unknown46(%8, %alloc_58, %arg104) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> + %10 = call @Unknown38(%8, %alloc_58, %arg104) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> %alloc_60 = memref.alloc() : memref<1x128x28x28xf16> %alloc_61 = memref.alloc() : memref<128xf32> %alloc_62 = memref.alloc() : memref<128xf32> @@ -538,7 +677,7 @@ module { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_60, %arg100, %alloc_63) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16> %alloc_64 = memref.alloc() : memref<128x128x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg99, %alloc_60, %alloc_64) {batch_group_count = 1 : 
i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16> - %11 = call @Unknown50(%arg99, %alloc_63) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> + %11 = call @Unknown42(%arg99, %alloc_63) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> %alloc_65 = memref.alloc() : memref<1x128x28x28xf16> %alloc_66 = memref.alloc() : memref<128xf32> %alloc_67 = memref.alloc() : memref<128xf32> @@ -573,7 +712,7 @@ module { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_80, %arg91, %alloc_83) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16> %alloc_84 = memref.alloc() : memref<64x64x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg90, %alloc_80, %alloc_84) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16> - %14 = call @Unknown65(%12, %alloc_83, %arg90) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> + %14 = call @Unknown57(%12, %alloc_83, %arg90) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> %alloc_85 = memref.alloc() : memref<1x64x56x56xf16> %alloc_86 = memref.alloc() : memref<64xf32> %alloc_87 = memref.alloc() : memref<64xf32> @@ -582,7 +721,7 @@ module { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_85, %arg88, %alloc_88) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16> %alloc_89 = memref.alloc() : memref<64x64x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg87, %alloc_85, %alloc_89) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16> - %15 = call @Unknown69(%arg87, %alloc_88) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> + %15 = call @Unknown61(%arg87, %alloc_88) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> %alloc_90 = memref.alloc() : memref<1x64x56x56xf16> %alloc_91 = memref.alloc() : memref<64xf32> %alloc_92 = memref.alloc() : memref<64xf32> @@ -602,34 +741,31 @@ module { %alloc_99 = memref.alloc() : memref<64x3x7x7xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg81, %alloc_96, %alloc_99) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = 
"NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x3x224x224xf16>, memref<1x64x112x112xf16>, memref<64x3x7x7xf16> %18 = call @Unknown77(%alloc_99) : (memref<64x3x7x7xf16>) -> memref<64x3x7x7xf32> - %19 = call @Unknown78(%arg141) : (memref<1x1000xf16>) -> memref<1x1000xf32> - %alloc_100 = memref.alloc() : memref<1000xf32> - byre.compute @ReduceSumOp_f32_f32(%19, %alloc_100) {dimensions = dense<0> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<1x1000xf32>, memref<1000xf32> - %20 = call @Unknown79(%alloc_100) : (memref<1000xf32>) -> memref<1000xf32> + %19 = call @Unknown78(%arg141) : (memref<1x1000xf16>) -> memref<1000xf32> %collapse_shape = memref.collapse_shape %arg141 [[0, 1]] : memref<1x1000xf16> into memref<1000xf16> %expand_shape = memref.expand_shape %collapse_shape [[0, 1]] : memref<1000xf16> into memref<1000x1xf16> - %alloc_101 = memref.alloc() : memref<1000x512xf16> - byre.compute @MatmulOp_f16f16_f16(%expand_shape, %arg139, %alloc_101) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<1000x1xf16>, memref<1x512xf16>, memref<1000x512xf16> - %21 = call @Unknown80(%alloc_101) : (memref<1000x512xf16>) -> memref<1000x512xf32> - %22 = call @Unknown81(%alloc_94) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %23 = call @Unknown82(%alloc_89) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %24 = call @Unknown83(%alloc_84) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %25 = call @Unknown84(%alloc_79) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %26 = call @Unknown85(%alloc_69) : (memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> - %27 = call @Unknown86(%alloc_64) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> - %28 = call @Unknown87(%alloc_74) : (memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> - %29 = call @Unknown88(%alloc_59) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> - %30 = call @Unknown89(%alloc_54) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> - %31 = call @Unknown90(%alloc_44) : (memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> - %32 = call @Unknown91(%alloc_39) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> - %33 = call @Unknown92(%alloc_49) : (memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> - %34 = call @Unknown93(%alloc_34) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> - %35 = call @Unknown94(%alloc_29) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> - %36 = call @Unknown95(%alloc_19) : (memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> - %37 = call @Unknown96(%alloc_14) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> - %38 = call @Unknown97(%alloc_24) : (memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> - %39 = call @Unknown98(%alloc_9) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> - %40 = call @Unknown99(%alloc_4) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> - return %alloc_98, %alloc_97, %18, %20, %21, %alloc_92, %alloc_91, %alloc_87, %alloc_86, %22, %23, %alloc_82, %alloc_81, %alloc_77, %alloc_76, %24, %25, %alloc_67, %alloc_66, %alloc_62, %alloc_61, %26, %27, %28, %alloc_72, %alloc_71, %alloc_57, %alloc_56, %alloc_52, %alloc_51, %29, %30, %alloc_42, %alloc_41, %alloc_37, %alloc_36, %31, %32, %33, %alloc_47, %alloc_46, %alloc_32, %alloc_31, %alloc_27, %alloc_26, %34, %35, %alloc_17, %alloc_16, %alloc_12, %alloc_11, %36, %37, %38, %alloc_22, %alloc_21, 
%alloc_7, %alloc_6, %alloc_2, %alloc_1, %39, %40 : memref<64xf32>, memref<64xf32>, memref<64x3x7x7xf32>, memref<1000xf32>, memref<1000x512xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64x64x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128x64x3x3xf32>, memref<128x128x3x3xf32>, memref<128x64x1x1xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128x128x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256x128x3x3xf32>, memref<256x256x3x3xf32>, memref<256x128x1x1xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256x256x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512x256x3x3xf32>, memref<512x512x3x3xf32>, memref<512x256x1x1xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512x512x3x3xf32> + %alloc_100 = memref.alloc() : memref<1000x512xf16> + byre.compute @MatmulOp_f16f16_f16(%expand_shape, %arg139, %alloc_100) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<1000x1xf16>, memref<1x512xf16>, memref<1000x512xf16> + %20 = call @Unknown79(%alloc_100) : (memref<1000x512xf16>) -> memref<1000x512xf32> + %21 = call @Unknown80(%alloc_94) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %22 = call @Unknown80(%alloc_89) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %23 = call @Unknown80(%alloc_84) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %24 = call @Unknown80(%alloc_79) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %25 = call @Unknown84(%alloc_69) : (memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> + %26 = call @Unknown85(%alloc_64) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> + %27 = call @Unknown86(%alloc_74) : (memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> + %28 = call @Unknown85(%alloc_59) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> + %29 = call @Unknown85(%alloc_54) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> + %30 = call @Unknown89(%alloc_44) : (memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> + %31 = call @Unknown90(%alloc_39) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> + %32 = call @Unknown91(%alloc_49) : (memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> + %33 = call @Unknown90(%alloc_34) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> + %34 = call @Unknown90(%alloc_29) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> + %35 = call @Unknown94(%alloc_19) : (memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> + %36 = call @Unknown95(%alloc_14) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> + %37 = call @Unknown96(%alloc_24) : (memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> + %38 = call @Unknown95(%alloc_9) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> + %39 = call @Unknown95(%alloc_4) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> + return %alloc_98, %alloc_97, %18, %19, %20, %alloc_92, %alloc_91, %alloc_87, %alloc_86, %21, %22, %alloc_82, %alloc_81, %alloc_77, %alloc_76, %23, %24, %alloc_67, %alloc_66, %alloc_62, %alloc_61, %25, %26, %27, %alloc_72, 
%alloc_71, %alloc_57, %alloc_56, %alloc_52, %alloc_51, %28, %29, %alloc_42, %alloc_41, %alloc_37, %alloc_36, %30, %31, %32, %alloc_47, %alloc_46, %alloc_32, %alloc_31, %alloc_27, %alloc_26, %33, %34, %alloc_17, %alloc_16, %alloc_12, %alloc_11, %35, %36, %37, %alloc_22, %alloc_21, %alloc_7, %alloc_6, %alloc_2, %alloc_1, %38, %39 : memref<64xf32>, memref<64xf32>, memref<64x3x7x7xf32>, memref<1000xf32>, memref<1000x512xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64x64x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128x64x3x3xf32>, memref<128x128x3x3xf32>, memref<128x64x1x1xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128x128x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256x128x3x3xf32>, memref<256x256x3x3xf32>, memref<256x128x1x1xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256x256x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512x256x3x3xf32>, memref<512x512x3x3xf32>, memref<512x256x1x1xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512x512x3x3xf32> } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/BW/5_alternative_scf_opt.mlir b/compiler/test/E2E/ResNet18/BW/5_alternative_scf_opt.mlir index 5206f661d..ef34370db 100644 --- a/compiler/test/E2E/ResNet18/BW/5_alternative_scf_opt.mlir +++ b/compiler/test/E2E/ResNet18/BW/5_alternative_scf_opt.mlir @@ -2,424 +2,563 @@ // CHECK-LABEL: func.func @main -#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> -#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> -#map2 = affine_map<(d0, d1) -> (d0, d1)> -#map3 = affine_map<(d0) -> (d0)> +#map = affine_map<() -> ()> module { func.func private @Unknown0(%arg0: memref<1x512xf16>, %arg1: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 4.900000e+01 : f16 %cst_0 = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %alloc = memref.alloc() : memref<1x512x7x7xf16> - linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg1, %arg0 : memref<1x512x7x7xf16>, memref<1x512xf16>) outs(%alloc : memref<1x512x7x7xf16>) { - ^bb0(%in: f16, %in_1: f16, %out: f16): - %0 = arith.divf %in_1, %cst : f16 - %1 = arith.cmpf ogt, %in, %cst_0 : f16 - %2 = arith.select %1, %0, %cst_0 : f16 - linalg.yield %2 : f16 + scf.for %arg2 = %c0 to %c512 step %c1 { + scf.for %arg3 = %c0 to %c7 step %c1 { + scf.for %arg4 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[0, %arg2] [1, 1] [1, 1] : memref<1x512xf16> to memref> + %subview_1 = memref.subview %alloc[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref> + %subview_2 = memref.subview %arg1[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_2 : memref>, memref>) outs(%subview_1 : memref>) { + 
^bb0(%in: f16, %in_3: f16, %out: f16): + %0 = arith.divf %in, %cst : f16 + %1 = arith.cmpf ogt, %in_3, %cst_0 : f16 + %2 = arith.select %1, %0, %cst_0 : f16 + linalg.yield %2 : f16 + } + } + } } return %alloc : memref<1x512x7x7xf16> } func.func private @Unknown4(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %alloc = memref.alloc() : memref<1x512x7x7xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) outs(%alloc : memref<1x512x7x7xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.cmpf ogt, %in, %cst : f16 - %1 = arith.select %0, %in_0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg2 = %c0 to %c512 step %c1 { + scf.for %arg3 = %c0 to %c7 step %c1 { + scf.for %arg4 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.cmpf ogt, %in, %cst : f16 + %1 = arith.select %0, %in_2, %cst : f16 + linalg.yield %1 : f16 + } + } + } } return %alloc : memref<1x512x7x7xf16> } func.func private @Unknown8(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %alloc = memref.alloc() : memref<1x512x7x7xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) outs(%alloc : memref<1x512x7x7xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.cmpf ogt, %in, %cst : f16 - %2 = arith.select %1, %0, %cst : f16 - linalg.yield %2 : f16 - } - return %alloc : memref<1x512x7x7xf16> - } - func.func private @Unknown12(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x512x7x7xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) outs(%alloc : memref<1x512x7x7xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.cmpf ogt, %in, %cst : f16 - %1 = arith.select %0, %in_0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg3 = %c0 to %c512 step %c1 { + scf.for %arg4 = %c0 to %c7 step %c1 { + scf.for %arg5 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] 
[1, 1, 1, 1] : memref<1x512x7x7xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref> + %subview_2 = memref.subview %arg2[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_1, %subview_2 : memref>, memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_3: f16, %in_4: f16, %out: f16): + %0 = arith.addf %in, %in_3 : f16 + %1 = arith.cmpf ogt, %in_4, %cst : f16 + %2 = arith.select %1, %0, %cst : f16 + linalg.yield %2 : f16 + } + } + } } return %alloc : memref<1x512x7x7xf16> } func.func private @Unknown19(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %alloc = memref.alloc() : memref<1x256x14x14xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) outs(%alloc : memref<1x256x14x14xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.cmpf ogt, %in, %cst : f16 - %2 = arith.select %1, %0, %cst : f16 - linalg.yield %2 : f16 + scf.for %arg3 = %c0 to %c256 step %c1 { + scf.for %arg4 = %c0 to %c14 step %c1 { + scf.for %arg5 = %c0 to %c14 step %c1 { + %subview = memref.subview %arg0[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x256x14x14xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x256x14x14xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x256x14x14xf16> to memref> + %subview_2 = memref.subview %arg2[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x256x14x14xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_1, %subview_2 : memref>, memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_3: f16, %in_4: f16, %out: f16): + %0 = arith.addf %in, %in_3 : f16 + %1 = arith.cmpf ogt, %in_4, %cst : f16 + %2 = arith.select %1, %0, %cst : f16 + linalg.yield %2 : f16 + } + } + } } return %alloc : memref<1x256x14x14xf16> } func.func private @Unknown23(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %alloc = memref.alloc() : memref<1x256x14x14xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) outs(%alloc : memref<1x256x14x14xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.cmpf ogt, %in, %cst : f16 - %1 = arith.select %0, %in_0, %cst : f16 - linalg.yield %1 : f16 - } - return %alloc : 
memref<1x256x14x14xf16> - } - func.func private @Unknown27(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x256x14x14xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) outs(%alloc : memref<1x256x14x14xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.cmpf ogt, %in, %cst : f16 - %2 = arith.select %1, %0, %cst : f16 - linalg.yield %2 : f16 - } - return %alloc : memref<1x256x14x14xf16> - } - func.func private @Unknown31(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x256x14x14xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) outs(%alloc : memref<1x256x14x14xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.cmpf ogt, %in, %cst : f16 - %1 = arith.select %0, %in_0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg2 = %c0 to %c256 step %c1 { + scf.for %arg3 = %c0 to %c14 step %c1 { + scf.for %arg4 = %c0 to %c14 step %c1 { + %subview = memref.subview %arg0[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x256x14x14xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x256x14x14xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x256x14x14xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.cmpf ogt, %in, %cst : f16 + %1 = arith.select %0, %in_2, %cst : f16 + linalg.yield %1 : f16 + } + } + } } return %alloc : memref<1x256x14x14xf16> } func.func private @Unknown38(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %alloc = memref.alloc() : memref<1x128x28x28xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) outs(%alloc : memref<1x128x28x28xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.cmpf ogt, %in, %cst : f16 - %2 = arith.select %1, %0, %cst : f16 - linalg.yield %2 : f16 + scf.for %arg3 = %c0 to %c128 step %c1 { + scf.for %arg4 = %c0 to %c28 step %c1 { + scf.for %arg5 = %c0 to %c28 step %c1 { + %subview = memref.subview %arg0[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x128x28x28xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : 
memref<1x128x28x28xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x128x28x28xf16> to memref> + %subview_2 = memref.subview %arg2[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x128x28x28xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_1, %subview_2 : memref>, memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_3: f16, %in_4: f16, %out: f16): + %0 = arith.addf %in, %in_3 : f16 + %1 = arith.cmpf ogt, %in_4, %cst : f16 + %2 = arith.select %1, %0, %cst : f16 + linalg.yield %2 : f16 + } + } + } } return %alloc : memref<1x128x28x28xf16> } func.func private @Unknown42(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %alloc = memref.alloc() : memref<1x128x28x28xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) outs(%alloc : memref<1x128x28x28xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.cmpf ogt, %in, %cst : f16 - %1 = arith.select %0, %in_0, %cst : f16 - linalg.yield %1 : f16 - } - return %alloc : memref<1x128x28x28xf16> - } - func.func private @Unknown46(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x128x28x28xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) outs(%alloc : memref<1x128x28x28xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.cmpf ogt, %in, %cst : f16 - %2 = arith.select %1, %0, %cst : f16 - linalg.yield %2 : f16 - } - return %alloc : memref<1x128x28x28xf16> - } - func.func private @Unknown50(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x128x28x28xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) outs(%alloc : memref<1x128x28x28xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.cmpf ogt, %in, %cst : f16 - %1 = arith.select %0, %in_0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg2 = %c0 to %c128 step %c1 { + scf.for %arg3 = %c0 to %c28 step %c1 { + scf.for %arg4 = %c0 to %c28 step %c1 { + %subview = memref.subview %arg0[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x128x28x28xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x128x28x28xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x128x28x28xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, 
%subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.cmpf ogt, %in, %cst : f16 + %1 = arith.select %0, %in_2, %cst : f16 + linalg.yield %1 : f16 + } + } + } } return %alloc : memref<1x128x28x28xf16> } func.func private @Unknown57(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %alloc = memref.alloc() : memref<1x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) outs(%alloc : memref<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.cmpf ogt, %in, %cst : f16 - %2 = arith.select %1, %0, %cst : f16 - linalg.yield %2 : f16 + scf.for %arg3 = %c0 to %c64 step %c1 { + scf.for %arg4 = %c0 to %c56 step %c1 { + scf.for %arg5 = %c0 to %c56 step %c1 { + %subview = memref.subview %arg0[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + %subview_2 = memref.subview %arg2[0, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_1, %subview_2 : memref>, memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_3: f16, %in_4: f16, %out: f16): + %0 = arith.addf %in, %in_3 : f16 + %1 = arith.cmpf ogt, %in_4, %cst : f16 + %2 = arith.select %1, %0, %cst : f16 + linalg.yield %2 : f16 + } + } + } } return %alloc : memref<1x64x56x56xf16> } func.func private @Unknown61(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %alloc = memref.alloc() : memref<1x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) outs(%alloc : memref<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.cmpf ogt, %in, %cst : f16 - %1 = arith.select %0, %in_0, %cst : f16 - linalg.yield %1 : f16 - } - return %alloc : memref<1x64x56x56xf16> - } - func.func private @Unknown65(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) outs(%alloc : memref<1x64x56x56xf16>) { - ^bb0(%in: f16, 
%in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.cmpf ogt, %in, %cst : f16 - %2 = arith.select %1, %0, %cst : f16 - linalg.yield %2 : f16 - } - return %alloc : memref<1x64x56x56xf16> - } - func.func private @Unknown69(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) outs(%alloc : memref<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.cmpf ogt, %in, %cst : f16 - %1 = arith.select %0, %in_0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c56 step %c1 { + scf.for %arg4 = %c0 to %c56 step %c1 { + %subview = memref.subview %arg0[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.cmpf ogt, %in, %cst : f16 + %1 = arith.select %0, %in_2, %cst : f16 + linalg.yield %1 : f16 + } + } + } } return %alloc : memref<1x64x56x56xf16> } func.func private @Unknown73(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %alloc = memref.alloc() : memref<1x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) outs(%alloc : memref<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.addf %in, %in_0 : f16 - linalg.yield %0 : f16 + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c56 step %c1 { + scf.for %arg4 = %c0 to %c56 step %c1 { + %subview = memref.subview %arg0[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.addf %in, %in_2 : f16 + linalg.yield %0 : f16 + } + } + } } return %alloc : memref<1x64x56x56xf16> } func.func private @Unknown74(%arg0: memref<1x64x112x112xf16>, %arg1: memref<1x64x112x112xf16>) -> memref<1x64x112x112xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c112 = arith.constant 112 : index %alloc = memref.alloc() : 
memref<1x64x112x112xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x64x112x112xf16>, memref<1x64x112x112xf16>) outs(%alloc : memref<1x64x112x112xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.cmpf ogt, %in, %cst : f16 - %1 = arith.select %0, %in_0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c112 step %c1 { + scf.for %arg4 = %c0 to %c112 step %c1 { + %subview = memref.subview %arg0[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x112x112xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x112x112xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x112x112xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.cmpf ogt, %in, %cst : f16 + %1 = arith.select %0, %in_2, %cst : f16 + linalg.yield %1 : f16 + } + } + } } return %alloc : memref<1x64x112x112xf16> } func.func private @Unknown77(%arg0: memref<64x3x7x7xf16>) -> memref<64x3x7x7xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + %c7 = arith.constant 7 : index %alloc = memref.alloc() : memref<64x3x7x7xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x3x7x7xf16>) outs(%alloc : memref<64x3x7x7xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c64 step %c1 { + scf.for %arg2 = %c0 to %c3 step %c1 { + scf.for %arg3 = %c0 to %c7 step %c1 { + scf.for %arg4 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x3x7x7xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x3x7x7xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<64x3x7x7xf32> } - func.func private @Unknown78(%arg0: memref<1x1000xf16>) -> memref<1x1000xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown78(%arg0: memref<1x1000xf16>) -> memref<1000xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index %alloc = memref.alloc() : memref<1x1000xf32> - linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%arg0 : memref<1x1000xf16>) outs(%alloc : memref<1x1000xf32>) { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<1x1000xf32> - } - func.func private @Unknown79(%arg0: memref<1000xf32>) -> memref<1000xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<1000xf32> - linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel"]} 
ins(%arg0 : memref<1000xf32>) outs(%alloc : memref<1000xf32>) { - ^bb0(%in: f32, %out: f32): - %0 = arith.truncf %in : f32 to f16 - %1 = arith.extf %0 : f16 to f32 - linalg.yield %1 : f32 - } - return %alloc : memref<1000xf32> - } - func.func private @Unknown80(%arg0: memref<1000x512xf16>) -> memref<1000x512xf32> attributes {__byteir_elementwise_fusion__} { + scf.for %arg1 = %c0 to %c1000 step %c1 { + %subview = memref.subview %arg0[0, %arg1] [1, 1] [1, 1] : memref<1x1000xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg1] [1, 1] [1, 1] : memref<1x1000xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + %1 = arith.truncf %0 : f32 to f16 + %2 = arith.extf %1 : f16 to f32 + linalg.yield %2 : f32 + } + } + %collapse_shape = memref.collapse_shape %alloc [[0, 1]] : memref<1x1000xf32> into memref<1000xf32> + return %collapse_shape : memref<1000xf32> + } + func.func private @Unknown79(%arg0: memref<1000x512xf16>) -> memref<1000x512xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index %alloc = memref.alloc() : memref<1000x512xf32> - linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%arg0 : memref<1000x512xf16>) outs(%alloc : memref<1000x512xf32>) { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c1000 step %c1 { + scf.for %arg2 = %c0 to %c512 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2] [1, 1] [1, 1] : memref<1000x512xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2] [1, 1] [1, 1] : memref<1000x512xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } } return %alloc : memref<1000x512xf32> } - func.func private @Unknown81(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<64x64x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf16>) outs(%alloc : memref<64x64x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<64x64x3x3xf32> - } - func.func private @Unknown82(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown80(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<64x64x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf16>) outs(%alloc : memref<64x64x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c64 step %c1 { + scf.for %arg2 = %c0 
to %c64 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x64x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x64x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<64x64x3x3xf32> } - func.func private @Unknown83(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<64x64x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf16>) outs(%alloc : memref<64x64x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<64x64x3x3xf32> - } - func.func private @Unknown84(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<64x64x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf16>) outs(%alloc : memref<64x64x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<64x64x3x3xf32> - } - func.func private @Unknown85(%arg0: memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown84(%arg0: memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<128x64x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x64x3x3xf16>) outs(%alloc : memref<128x64x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<128x64x3x3xf32> } - func.func private @Unknown86(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown85(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = 
arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<128x128x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x128x3x3xf16>) outs(%alloc : memref<128x128x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c128 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x128x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x128x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<128x128x3x3xf32> } - func.func private @Unknown87(%arg0: memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown86(%arg0: memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index %alloc = memref.alloc() : memref<128x64x1x1xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x64x1x1xf16>) outs(%alloc : memref<128x64x1x1xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x1x1xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x1x1xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } } return %alloc : memref<128x64x1x1xf32> } - func.func private @Unknown88(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<128x128x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x128x3x3xf16>) outs(%alloc : memref<128x128x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<128x128x3x3xf32> - } - func.func private @Unknown89(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<128x128x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x128x3x3xf16>) outs(%alloc : memref<128x128x3x3xf32>) attrs 
= {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<128x128x3x3xf32> - } - func.func private @Unknown90(%arg0: memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown89(%arg0: memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<256x128x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x128x3x3xf16>) outs(%alloc : memref<256x128x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c128 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<256x128x3x3xf32> } - func.func private @Unknown91(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown90(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<256x256x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x256x3x3xf16>) outs(%alloc : memref<256x256x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x256x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x256x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<256x256x3x3xf32> } - func.func private @Unknown92(%arg0: memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown91(%arg0: memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index 
+ %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index %alloc = memref.alloc() : memref<256x128x1x1xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x128x1x1xf16>) outs(%alloc : memref<256x128x1x1xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c128 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x1x1xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x1x1xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } } return %alloc : memref<256x128x1x1xf32> } - func.func private @Unknown93(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<256x256x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x256x3x3xf16>) outs(%alloc : memref<256x256x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<256x256x3x3xf32> - } - func.func private @Unknown94(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<256x256x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x256x3x3xf16>) outs(%alloc : memref<256x256x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<256x256x3x3xf32> - } - func.func private @Unknown95(%arg0: memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown94(%arg0: memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<512x256x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x256x3x3xf16>) outs(%alloc : memref<512x256x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : 
memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<512x256x3x3xf32> } - func.func private @Unknown96(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown95(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<512x512x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x512x3x3xf16>) outs(%alloc : memref<512x512x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c512 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x512x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x512x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<512x512x3x3xf32> } - func.func private @Unknown97(%arg0: memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown96(%arg0: memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<512x256x1x1xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x256x1x1xf16>) outs(%alloc : memref<512x256x1x1xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x1x1xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x1x1xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } } return %alloc : memref<512x256x1x1xf32> } - func.func private @Unknown98(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<512x512x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x512x3x3xf16>) outs(%alloc : memref<512x512x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = 
arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<512x512x3x3xf32> - } - func.func private @Unknown99(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<512x512x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x512x3x3xf16>) outs(%alloc : memref<512x512x3x3xf32>) attrs = {minor_to_major = dense<[0, 1, 3, 2]> : tensor<4xindex>} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<512x512x3x3xf32> - } func.func @main(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>, %arg3: memref<64xf32>, %arg4: memref<64xf32>, %arg5: memref<64xf32>, %arg6: memref<64xf32>, %arg7: memref<64xf32>, %arg8: memref<64xf32>, %arg9: memref<64xf32>, %arg10: memref<128xf32>, %arg11: memref<128xf32>, %arg12: memref<128xf32>, %arg13: memref<128xf32>, %arg14: memref<128xf32>, %arg15: memref<128xf32>, %arg16: memref<128xf32>, %arg17: memref<128xf32>, %arg18: memref<128xf32>, %arg19: memref<128xf32>, %arg20: memref<256xf32>, %arg21: memref<256xf32>, %arg22: memref<256xf32>, %arg23: memref<256xf32>, %arg24: memref<256xf32>, %arg25: memref<256xf32>, %arg26: memref<256xf32>, %arg27: memref<256xf32>, %arg28: memref<256xf32>, %arg29: memref<256xf32>, %arg30: memref<512xf32>, %arg31: memref<512xf32>, %arg32: memref<512xf32>, %arg33: memref<512xf32>, %arg34: memref<512xf32>, %arg35: memref<512xf32>, %arg36: memref<512xf32>, %arg37: memref<512xf32>, %arg38: memref<512xf32>, %arg39: memref<512xf32>, %arg40: memref<64xf32>, %arg41: memref<64xf32>, %arg42: memref<64xf32>, %arg43: memref<64xf32>, %arg44: memref<64xf32>, %arg45: memref<64xf32>, %arg46: memref<64xf32>, %arg47: memref<64xf32>, %arg48: memref<64xf32>, %arg49: memref<64xf32>, %arg50: memref<128xf32>, %arg51: memref<128xf32>, %arg52: memref<128xf32>, %arg53: memref<128xf32>, %arg54: memref<128xf32>, %arg55: memref<128xf32>, %arg56: memref<128xf32>, %arg57: memref<128xf32>, %arg58: memref<128xf32>, %arg59: memref<128xf32>, %arg60: memref<256xf32>, %arg61: memref<256xf32>, %arg62: memref<256xf32>, %arg63: memref<256xf32>, %arg64: memref<256xf32>, %arg65: memref<256xf32>, %arg66: memref<256xf32>, %arg67: memref<256xf32>, %arg68: memref<256xf32>, %arg69: memref<256xf32>, %arg70: memref<512xf32>, %arg71: memref<512xf32>, %arg72: memref<512xf32>, %arg73: memref<512xf32>, %arg74: memref<512xf32>, %arg75: memref<512xf32>, %arg76: memref<512xf32>, %arg77: memref<512xf32>, %arg78: memref<512xf32>, %arg79: memref<512xf32>, %arg80: memref<64x3x7x7xf16>, %arg81: memref<1x3x224x224xf16>, %arg82: memref<1x64x112x112xf16>, %arg83: memref<1x64x112x112xf16>, %arg84: memref<1x64x56x56xf16>, %arg85: memref<64x64x3x3xf16>, %arg86: memref<1x64x56x56xf16>, %arg87: memref<1x64x56x56xf16>, %arg88: memref<64x64x3x3xf16>, %arg89: memref<1x64x56x56xf16>, %arg90: memref<1x64x56x56xf16>, %arg91: memref<64x64x3x3xf16>, %arg92: memref<1x64x56x56xf16>, %arg93: memref<1x64x56x56xf16>, %arg94: memref<64x64x3x3xf16>, %arg95: memref<1x64x56x56xf16>, %arg96: memref<1x64x56x56xf16>, %arg97: memref<128x64x3x3xf16>, %arg98: memref<1x128x28x28xf16>, %arg99: memref<1x128x28x28xf16>, %arg100: memref<128x128x3x3xf16>, %arg101: memref<1x128x28x28xf16>, %arg102: memref<128x64x1x1xf16>, %arg103: memref<1x128x28x28xf16>, %arg104: memref<1x128x28x28xf16>, %arg105: memref<128x128x3x3xf16>, %arg106: memref<1x128x28x28xf16>, 
%arg107: memref<1x128x28x28xf16>, %arg108: memref<128x128x3x3xf16>, %arg109: memref<1x128x28x28xf16>, %arg110: memref<1x128x28x28xf16>, %arg111: memref<256x128x3x3xf16>, %arg112: memref<1x256x14x14xf16>, %arg113: memref<1x256x14x14xf16>, %arg114: memref<256x256x3x3xf16>, %arg115: memref<1x256x14x14xf16>, %arg116: memref<256x128x1x1xf16>, %arg117: memref<1x256x14x14xf16>, %arg118: memref<1x256x14x14xf16>, %arg119: memref<256x256x3x3xf16>, %arg120: memref<1x256x14x14xf16>, %arg121: memref<1x256x14x14xf16>, %arg122: memref<256x256x3x3xf16>, %arg123: memref<1x256x14x14xf16>, %arg124: memref<1x256x14x14xf16>, %arg125: memref<512x256x3x3xf16>, %arg126: memref<1x512x7x7xf16>, %arg127: memref<1x512x7x7xf16>, %arg128: memref<512x512x3x3xf16>, %arg129: memref<1x512x7x7xf16>, %arg130: memref<512x256x1x1xf16>, %arg131: memref<1x512x7x7xf16>, %arg132: memref<1x512x7x7xf16>, %arg133: memref<512x512x3x3xf16>, %arg134: memref<1x512x7x7xf16>, %arg135: memref<1x512x7x7xf16>, %arg136: memref<512x512x3x3xf16>, %arg137: memref<1x512x7x7xf16>, %arg138: memref<1x512x7x7xf16>, %arg139: memref<1x512xf16>, %arg140: memref<512x1000xf16>, %arg141: memref<1x1000xf16>) -> (memref<64xf32>, memref<64xf32>, memref<64x3x7x7xf32>, memref<1000xf32>, memref<1000x512xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64x64x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128x64x3x3xf32>, memref<128x128x3x3xf32>, memref<128x64x1x1xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128x128x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256x128x3x3xf32>, memref<256x256x3x3xf32>, memref<256x128x1x1xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256x256x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512x256x3x3xf32>, memref<512x512x3x3xf32>, memref<512x256x1x1xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512x512x3x3xf32>) attributes {__placeholder__byre.entry_point} { %alloc = memref.alloc() : memref<1x512xf16> byre.compute @MatmulOp_f16f16_f16(%arg141, %arg140, %alloc) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<1x1000xf16>, memref<512x1000xf16>, memref<1x512xf16> @@ -450,7 +589,7 @@ module { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_10, %arg128, %alloc_13) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16> %alloc_14 = memref.alloc() : memref<512x512x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg127, %alloc_10, %alloc_14) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16>, 
memref<1x512x7x7xf16>, memref<512x512x3x3xf16> - %3 = call @Unknown12(%arg127, %alloc_13) : (memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> + %3 = call @Unknown4(%arg127, %alloc_13) : (memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> %alloc_15 = memref.alloc() : memref<1x512x7x7xf16> %alloc_16 = memref.alloc() : memref<512xf32> %alloc_17 = memref.alloc() : memref<512xf32> @@ -485,7 +624,7 @@ module { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_30, %arg119, %alloc_33) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16> %alloc_34 = memref.alloc() : memref<256x256x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg118, %alloc_30, %alloc_34) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16> - %6 = call @Unknown27(%4, %alloc_33, %arg118) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> + %6 = call @Unknown19(%4, %alloc_33, %arg118) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> %alloc_35 = memref.alloc() : memref<1x256x14x14xf16> %alloc_36 = memref.alloc() : memref<256xf32> %alloc_37 = memref.alloc() : memref<256xf32> @@ -494,7 +633,7 @@ module { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_35, %arg114, %alloc_38) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16> %alloc_39 = memref.alloc() : memref<256x256x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg113, %alloc_35, %alloc_39) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16> - %7 = call @Unknown31(%arg113, %alloc_38) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> + %7 = call @Unknown23(%arg113, %alloc_38) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> %alloc_40 = memref.alloc() : memref<1x256x14x14xf16> %alloc_41 = memref.alloc() : memref<256xf32> %alloc_42 = memref.alloc() : memref<256xf32> @@ -529,7 +668,7 @@ module { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_55, %arg105, %alloc_58) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16> %alloc_59 = memref.alloc() : memref<128x128x3x3xf16> 
byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg104, %alloc_55, %alloc_59) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16> - %10 = call @Unknown46(%8, %alloc_58, %arg104) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> + %10 = call @Unknown38(%8, %alloc_58, %arg104) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> %alloc_60 = memref.alloc() : memref<1x128x28x28xf16> %alloc_61 = memref.alloc() : memref<128xf32> %alloc_62 = memref.alloc() : memref<128xf32> @@ -538,7 +677,7 @@ module { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_60, %arg100, %alloc_63) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16> %alloc_64 = memref.alloc() : memref<128x128x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg99, %alloc_60, %alloc_64) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16> - %11 = call @Unknown50(%arg99, %alloc_63) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> + %11 = call @Unknown42(%arg99, %alloc_63) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> %alloc_65 = memref.alloc() : memref<1x128x28x28xf16> %alloc_66 = memref.alloc() : memref<128xf32> %alloc_67 = memref.alloc() : memref<128xf32> @@ -573,7 +712,7 @@ module { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_80, %arg91, %alloc_83) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16> %alloc_84 = memref.alloc() : memref<64x64x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg90, %alloc_80, %alloc_84) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16> - %14 = call @Unknown65(%12, %alloc_83, %arg90) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> + %14 = call @Unknown57(%12, %alloc_83, %arg90) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> %alloc_85 = memref.alloc() : memref<1x64x56x56xf16> %alloc_86 = memref.alloc() : memref<64xf32> %alloc_87 = memref.alloc() : memref<64xf32> @@ -582,7 +721,7 @@ module { byre.compute 
@ConvBackwardDataOp_f16f16_f16(%alloc_85, %arg88, %alloc_88) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16> %alloc_89 = memref.alloc() : memref<64x64x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg87, %alloc_85, %alloc_89) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16> - %15 = call @Unknown69(%arg87, %alloc_88) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> + %15 = call @Unknown61(%arg87, %alloc_88) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> %alloc_90 = memref.alloc() : memref<1x64x56x56xf16> %alloc_91 = memref.alloc() : memref<64xf32> %alloc_92 = memref.alloc() : memref<64xf32> @@ -602,34 +741,31 @@ module { %alloc_99 = memref.alloc() : memref<64x3x7x7xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg81, %alloc_96, %alloc_99) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x3x224x224xf16>, memref<1x64x112x112xf16>, memref<64x3x7x7xf16> %18 = call @Unknown77(%alloc_99) : (memref<64x3x7x7xf16>) -> memref<64x3x7x7xf32> - %19 = call @Unknown78(%arg141) : (memref<1x1000xf16>) -> memref<1x1000xf32> - %alloc_100 = memref.alloc() : memref<1000xf32> - byre.compute @ReduceSumOp_f32_f32(%19, %alloc_100) {dimensions = dense<0> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<1x1000xf32>, memref<1000xf32> - %20 = call @Unknown79(%alloc_100) : (memref<1000xf32>) -> memref<1000xf32> + %19 = call @Unknown78(%arg141) : (memref<1x1000xf16>) -> memref<1000xf32> %collapse_shape = memref.collapse_shape %arg141 [[0, 1]] : memref<1x1000xf16> into memref<1000xf16> %expand_shape = memref.expand_shape %collapse_shape [[0, 1]] : memref<1000xf16> into memref<1000x1xf16> - %alloc_101 = memref.alloc() : memref<1000x512xf16> - byre.compute @MatmulOp_f16f16_f16(%expand_shape, %arg139, %alloc_101) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<1000x1xf16>, memref<1x512xf16>, memref<1000x512xf16> - %21 = call @Unknown80(%alloc_101) : (memref<1000x512xf16>) -> memref<1000x512xf32> - %22 = call @Unknown81(%alloc_94) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %23 = call @Unknown82(%alloc_89) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %24 = call @Unknown83(%alloc_84) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %25 = call @Unknown84(%alloc_79) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %26 = call @Unknown85(%alloc_69) : (memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> - %27 = call @Unknown86(%alloc_64) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> - %28 = call @Unknown87(%alloc_74) : (memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> - %29 = call @Unknown88(%alloc_59) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> - %30 = call 
@Unknown89(%alloc_54) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> - %31 = call @Unknown90(%alloc_44) : (memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> - %32 = call @Unknown91(%alloc_39) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> - %33 = call @Unknown92(%alloc_49) : (memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> - %34 = call @Unknown93(%alloc_34) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> - %35 = call @Unknown94(%alloc_29) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> - %36 = call @Unknown95(%alloc_19) : (memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> - %37 = call @Unknown96(%alloc_14) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> - %38 = call @Unknown97(%alloc_24) : (memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> - %39 = call @Unknown98(%alloc_9) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> - %40 = call @Unknown99(%alloc_4) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> - return %alloc_98, %alloc_97, %18, %20, %21, %alloc_92, %alloc_91, %alloc_87, %alloc_86, %22, %23, %alloc_82, %alloc_81, %alloc_77, %alloc_76, %24, %25, %alloc_67, %alloc_66, %alloc_62, %alloc_61, %26, %27, %28, %alloc_72, %alloc_71, %alloc_57, %alloc_56, %alloc_52, %alloc_51, %29, %30, %alloc_42, %alloc_41, %alloc_37, %alloc_36, %31, %32, %33, %alloc_47, %alloc_46, %alloc_32, %alloc_31, %alloc_27, %alloc_26, %34, %35, %alloc_17, %alloc_16, %alloc_12, %alloc_11, %36, %37, %38, %alloc_22, %alloc_21, %alloc_7, %alloc_6, %alloc_2, %alloc_1, %39, %40 : memref<64xf32>, memref<64xf32>, memref<64x3x7x7xf32>, memref<1000xf32>, memref<1000x512xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64x64x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128x64x3x3xf32>, memref<128x128x3x3xf32>, memref<128x64x1x1xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128x128x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256x128x3x3xf32>, memref<256x256x3x3xf32>, memref<256x128x1x1xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256x256x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512x256x3x3xf32>, memref<512x512x3x3xf32>, memref<512x256x1x1xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512x512x3x3xf32> + %alloc_100 = memref.alloc() : memref<1000x512xf16> + byre.compute @MatmulOp_f16f16_f16(%expand_shape, %arg139, %alloc_100) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<1000x1xf16>, memref<1x512xf16>, memref<1000x512xf16> + %20 = call @Unknown79(%alloc_100) : (memref<1000x512xf16>) -> memref<1000x512xf32> + %21 = call @Unknown80(%alloc_94) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %22 = call @Unknown80(%alloc_89) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %23 = call @Unknown80(%alloc_84) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %24 = call @Unknown80(%alloc_79) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %25 = call @Unknown84(%alloc_69) : (memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> + 
%26 = call @Unknown85(%alloc_64) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> + %27 = call @Unknown86(%alloc_74) : (memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> + %28 = call @Unknown85(%alloc_59) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> + %29 = call @Unknown85(%alloc_54) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> + %30 = call @Unknown89(%alloc_44) : (memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> + %31 = call @Unknown90(%alloc_39) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> + %32 = call @Unknown91(%alloc_49) : (memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> + %33 = call @Unknown90(%alloc_34) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> + %34 = call @Unknown90(%alloc_29) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> + %35 = call @Unknown94(%alloc_19) : (memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> + %36 = call @Unknown95(%alloc_14) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> + %37 = call @Unknown96(%alloc_24) : (memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> + %38 = call @Unknown95(%alloc_9) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> + %39 = call @Unknown95(%alloc_4) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> + return %alloc_98, %alloc_97, %18, %19, %20, %alloc_92, %alloc_91, %alloc_87, %alloc_86, %21, %22, %alloc_82, %alloc_81, %alloc_77, %alloc_76, %23, %24, %alloc_67, %alloc_66, %alloc_62, %alloc_61, %25, %26, %27, %alloc_72, %alloc_71, %alloc_57, %alloc_56, %alloc_52, %alloc_51, %28, %29, %alloc_42, %alloc_41, %alloc_37, %alloc_36, %30, %31, %32, %alloc_47, %alloc_46, %alloc_32, %alloc_31, %alloc_27, %alloc_26, %33, %34, %alloc_17, %alloc_16, %alloc_12, %alloc_11, %35, %36, %37, %alloc_22, %alloc_21, %alloc_7, %alloc_6, %alloc_2, %alloc_1, %38, %39 : memref<64xf32>, memref<64xf32>, memref<64x3x7x7xf32>, memref<1000xf32>, memref<1000x512xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64x64x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128x64x3x3xf32>, memref<128x128x3x3xf32>, memref<128x64x1x1xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128x128x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256x128x3x3xf32>, memref<256x256x3x3xf32>, memref<256x128x1x1xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256x256x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512x256x3x3xf32>, memref<512x512x3x3xf32>, memref<512x256x1x1xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512x512x3x3xf32> } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/BW/6_gpu_opt.mlir b/compiler/test/E2E/ResNet18/BW/6_gpu_opt.mlir index afd93d077..83a5ea971 100644 --- a/compiler/test/E2E/ResNet18/BW/6_gpu_opt.mlir +++ b/compiler/test/E2E/ResNet18/BW/6_gpu_opt.mlir @@ -4,1571 +4,468 @@ module { func.func private @Unknown0(%arg0: memref<1x512xf16>, %arg1: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 
: index %cst = arith.constant 0.000000e+00 : f16 %cst_0 = arith.constant 4.900000e+01 : f16 - %c0 = arith.constant 0 : index %c25088 = arith.constant 25088 : index - %c1 = arith.constant 1 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<1x512x7x7xf16> scf.for %arg2 = %c0 to %c25088 step %c1 { %0 = arith.remsi %arg2, %c7 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c7 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c7 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c7 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c7 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c7 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg1[%c0, %19, %13, %3] : memref<1x512x7x7xf16> - %21 = memref.load %arg0[%c0, %19] : memref<1x512xf16> - %22 = arith.divf %21, %cst_0 : f16 - %23 = arith.cmpf ogt, %20, %cst : f16 - %24 = arith.select %23, %22, %cst : f16 - memref.store %24, %alloc[%c0, %19, %13, %3] : memref<1x512x7x7xf16> + %1 = arith.divsi %arg2, %c7 : index + %2 = arith.remsi %1, %c7 : index + %3 = arith.divsi %1, %c7 : index + %4 = memref.load %arg0[%c0, %3] : memref<1x512xf16> + %5 = memref.load %arg1[%c0, %3, %2, %0] : memref<1x512x7x7xf16> + %6 = arith.divf %4, %cst_0 : f16 + %7 = arith.cmpf ogt, %5, %cst : f16 + %8 = arith.select %7, %6, %cst : f16 + memref.store %8, %alloc[%c0, %3, %2, %0] : memref<1x512x7x7xf16> } return %alloc : memref<1x512x7x7xf16> } func.func private @Unknown4(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 + %c7 = arith.constant 7 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 %c25088 = arith.constant 25088 : index - %c1 = arith.constant 1 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<1x512x7x7xf16> scf.for %arg2 = %c0 to %c25088 step %c1 { %0 = arith.remsi %arg2, %c7 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c7 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c7 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c7 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c7 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c7 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x512x7x7xf16> - %21 = memref.load %arg1[%c0, %19, %13, %3] : memref<1x512x7x7xf16> - %22 = arith.cmpf ogt, %20, %cst : f16 - %23 = arith.select %22, %21, %cst : f16 - memref.store %23, %alloc[%c0, %19, %13, %3] : memref<1x512x7x7xf16> + %1 = arith.divsi %arg2, %c7 : 
index + %2 = arith.remsi %1, %c7 : index + %3 = arith.divsi %1, %c7 : index + %4 = memref.load %arg0[%c0, %3, %2, %0] : memref<1x512x7x7xf16> + %5 = memref.load %arg1[%c0, %3, %2, %0] : memref<1x512x7x7xf16> + %6 = arith.cmpf ogt, %4, %cst : f16 + %7 = arith.select %6, %5, %cst : f16 + memref.store %7, %alloc[%c0, %3, %2, %0] : memref<1x512x7x7xf16> } return %alloc : memref<1x512x7x7xf16> } func.func private @Unknown8(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 + %c7 = arith.constant 7 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 %c25088 = arith.constant 25088 : index - %c1 = arith.constant 1 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<1x512x7x7xf16> scf.for %arg3 = %c0 to %c25088 step %c1 { %0 = arith.remsi %arg3, %c7 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c7 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg3, %c0 : index - %5 = arith.subi %c-1, %arg3 : index - %6 = arith.select %4, %5, %arg3 : index - %7 = arith.divsi %6, %c7 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c7 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c7 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c7 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg2[%c0, %19, %13, %3] : memref<1x512x7x7xf16> - %21 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x512x7x7xf16> - %22 = memref.load %arg1[%c0, %19, %13, %3] : memref<1x512x7x7xf16> - %23 = arith.addf %21, %22 : f16 - %24 = arith.cmpf ogt, %20, %cst : f16 - %25 = arith.select %24, %23, %cst : f16 - memref.store %25, %alloc[%c0, %19, %13, %3] : memref<1x512x7x7xf16> - } - return %alloc : memref<1x512x7x7xf16> - } - func.func private @Unknown12(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index - %c1 = arith.constant 1 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %alloc = memref.alloc() : memref<1x512x7x7xf16> - scf.for %arg2 = %c0 to %c25088 step %c1 { - %0 = arith.remsi %arg2, %c7 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c7 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c7 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c7 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c7 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c7 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x512x7x7xf16> - %21 = memref.load %arg1[%c0, 
%19, %13, %3] : memref<1x512x7x7xf16> - %22 = arith.cmpf ogt, %20, %cst : f16 - %23 = arith.select %22, %21, %cst : f16 - memref.store %23, %alloc[%c0, %19, %13, %3] : memref<1x512x7x7xf16> + %1 = arith.divsi %arg3, %c7 : index + %2 = arith.remsi %1, %c7 : index + %3 = arith.divsi %1, %c7 : index + %4 = memref.load %arg0[%c0, %3, %2, %0] : memref<1x512x7x7xf16> + %5 = memref.load %arg1[%c0, %3, %2, %0] : memref<1x512x7x7xf16> + %6 = memref.load %arg2[%c0, %3, %2, %0] : memref<1x512x7x7xf16> + %7 = arith.addf %4, %5 : f16 + %8 = arith.cmpf ogt, %6, %cst : f16 + %9 = arith.select %8, %7, %cst : f16 + memref.store %9, %alloc[%c0, %3, %2, %0] : memref<1x512x7x7xf16> } return %alloc : memref<1x512x7x7xf16> } func.func private @Unknown19(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 + %c14 = arith.constant 14 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 %c50176 = arith.constant 50176 : index - %c1 = arith.constant 1 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<1x256x14x14xf16> scf.for %arg3 = %c0 to %c50176 step %c1 { %0 = arith.remsi %arg3, %c14 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c14 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg3, %c0 : index - %5 = arith.subi %c-1, %arg3 : index - %6 = arith.select %4, %5, %arg3 : index - %7 = arith.divsi %6, %c14 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c14 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c14 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c14 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg2[%c0, %19, %13, %3] : memref<1x256x14x14xf16> - %21 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x256x14x14xf16> - %22 = memref.load %arg1[%c0, %19, %13, %3] : memref<1x256x14x14xf16> - %23 = arith.addf %21, %22 : f16 - %24 = arith.cmpf ogt, %20, %cst : f16 - %25 = arith.select %24, %23, %cst : f16 - memref.store %25, %alloc[%c0, %19, %13, %3] : memref<1x256x14x14xf16> + %1 = arith.divsi %arg3, %c14 : index + %2 = arith.remsi %1, %c14 : index + %3 = arith.divsi %1, %c14 : index + %4 = memref.load %arg0[%c0, %3, %2, %0] : memref<1x256x14x14xf16> + %5 = memref.load %arg1[%c0, %3, %2, %0] : memref<1x256x14x14xf16> + %6 = memref.load %arg2[%c0, %3, %2, %0] : memref<1x256x14x14xf16> + %7 = arith.addf %4, %5 : f16 + %8 = arith.cmpf ogt, %6, %cst : f16 + %9 = arith.select %8, %7, %cst : f16 + memref.store %9, %alloc[%c0, %3, %2, %0] : memref<1x256x14x14xf16> } return %alloc : memref<1x256x14x14xf16> } func.func private @Unknown23(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index - %c1 = arith.constant 1 : index %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %alloc = memref.alloc() : memref<1x256x14x14xf16> - scf.for %arg2 = %c0 to %c50176 step %c1 { - %0 = arith.remsi %arg2, %c14 : 
index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c14 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c14 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c14 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c14 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c14 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x256x14x14xf16> - %21 = memref.load %arg1[%c0, %19, %13, %3] : memref<1x256x14x14xf16> - %22 = arith.cmpf ogt, %20, %cst : f16 - %23 = arith.select %22, %21, %cst : f16 - memref.store %23, %alloc[%c0, %19, %13, %3] : memref<1x256x14x14xf16> - } - return %alloc : memref<1x256x14x14xf16> - } - func.func private @Unknown27(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index %c1 = arith.constant 1 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %alloc = memref.alloc() : memref<1x256x14x14xf16> - scf.for %arg3 = %c0 to %c50176 step %c1 { - %0 = arith.remsi %arg3, %c14 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c14 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg3, %c0 : index - %5 = arith.subi %c-1, %arg3 : index - %6 = arith.select %4, %5, %arg3 : index - %7 = arith.divsi %6, %c14 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c14 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c14 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c14 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg2[%c0, %19, %13, %3] : memref<1x256x14x14xf16> - %21 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x256x14x14xf16> - %22 = memref.load %arg1[%c0, %19, %13, %3] : memref<1x256x14x14xf16> - %23 = arith.addf %21, %22 : f16 - %24 = arith.cmpf ogt, %20, %cst : f16 - %25 = arith.select %24, %23, %cst : f16 - memref.store %25, %alloc[%c0, %19, %13, %3] : memref<1x256x14x14xf16> - } - return %alloc : memref<1x256x14x14xf16> - } - func.func private @Unknown31(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 %c50176 = arith.constant 50176 : index - %c1 = arith.constant 1 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<1x256x14x14xf16> scf.for %arg2 = %c0 to %c50176 step %c1 { %0 = arith.remsi %arg2, %c14 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c14 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = 
arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c14 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c14 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c14 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c14 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x256x14x14xf16> - %21 = memref.load %arg1[%c0, %19, %13, %3] : memref<1x256x14x14xf16> - %22 = arith.cmpf ogt, %20, %cst : f16 - %23 = arith.select %22, %21, %cst : f16 - memref.store %23, %alloc[%c0, %19, %13, %3] : memref<1x256x14x14xf16> + %1 = arith.divsi %arg2, %c14 : index + %2 = arith.remsi %1, %c14 : index + %3 = arith.divsi %1, %c14 : index + %4 = memref.load %arg0[%c0, %3, %2, %0] : memref<1x256x14x14xf16> + %5 = memref.load %arg1[%c0, %3, %2, %0] : memref<1x256x14x14xf16> + %6 = arith.cmpf ogt, %4, %cst : f16 + %7 = arith.select %6, %5, %cst : f16 + memref.store %7, %alloc[%c0, %3, %2, %0] : memref<1x256x14x14xf16> } return %alloc : memref<1x256x14x14xf16> } func.func private @Unknown38(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 + %c28 = arith.constant 28 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 %c100352 = arith.constant 100352 : index - %c1 = arith.constant 1 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<1x128x28x28xf16> scf.for %arg3 = %c0 to %c100352 step %c1 { %0 = arith.remsi %arg3, %c28 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c28 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg3, %c0 : index - %5 = arith.subi %c-1, %arg3 : index - %6 = arith.select %4, %5, %arg3 : index - %7 = arith.divsi %6, %c28 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c28 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c28 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c28 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg2[%c0, %19, %13, %3] : memref<1x128x28x28xf16> - %21 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x128x28x28xf16> - %22 = memref.load %arg1[%c0, %19, %13, %3] : memref<1x128x28x28xf16> - %23 = arith.addf %21, %22 : f16 - %24 = arith.cmpf ogt, %20, %cst : f16 - %25 = arith.select %24, %23, %cst : f16 - memref.store %25, %alloc[%c0, %19, %13, %3] : memref<1x128x28x28xf16> + %1 = arith.divsi %arg3, %c28 : index + %2 = arith.remsi %1, %c28 : index + %3 = arith.divsi %1, %c28 : index + %4 = memref.load %arg0[%c0, %3, %2, %0] : memref<1x128x28x28xf16> + %5 = memref.load %arg1[%c0, %3, %2, %0] : memref<1x128x28x28xf16> + %6 = memref.load %arg2[%c0, %3, %2, %0] : memref<1x128x28x28xf16> + %7 = arith.addf %4, %5 : f16 + %8 = arith.cmpf ogt, %6, %cst : f16 + %9 = arith.select %8, %7, %cst : f16 + memref.store %9, 
%alloc[%c0, %3, %2, %0] : memref<1x128x28x28xf16> } return %alloc : memref<1x128x28x28xf16> } func.func private @Unknown42(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c1 = arith.constant 1 : index %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %alloc = memref.alloc() : memref<1x128x28x28xf16> - scf.for %arg2 = %c0 to %c100352 step %c1 { - %0 = arith.remsi %arg2, %c28 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c28 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c28 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c28 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c28 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c28 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x128x28x28xf16> - %21 = memref.load %arg1[%c0, %19, %13, %3] : memref<1x128x28x28xf16> - %22 = arith.cmpf ogt, %20, %cst : f16 - %23 = arith.select %22, %21, %cst : f16 - memref.store %23, %alloc[%c0, %19, %13, %3] : memref<1x128x28x28xf16> - } - return %alloc : memref<1x128x28x28xf16> - } - func.func private @Unknown46(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index %c1 = arith.constant 1 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %alloc = memref.alloc() : memref<1x128x28x28xf16> - scf.for %arg3 = %c0 to %c100352 step %c1 { - %0 = arith.remsi %arg3, %c28 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c28 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg3, %c0 : index - %5 = arith.subi %c-1, %arg3 : index - %6 = arith.select %4, %5, %arg3 : index - %7 = arith.divsi %6, %c28 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c28 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c28 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c28 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg2[%c0, %19, %13, %3] : memref<1x128x28x28xf16> - %21 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x128x28x28xf16> - %22 = memref.load %arg1[%c0, %19, %13, %3] : memref<1x128x28x28xf16> - %23 = arith.addf %21, %22 : f16 - %24 = arith.cmpf ogt, %20, %cst : f16 - %25 = arith.select %24, %23, %cst : f16 - memref.store %25, %alloc[%c0, %19, %13, %3] : memref<1x128x28x28xf16> - } - return %alloc : memref<1x128x28x28xf16> - } - func.func private @Unknown50(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) -> 
memref<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 %c100352 = arith.constant 100352 : index - %c1 = arith.constant 1 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<1x128x28x28xf16> scf.for %arg2 = %c0 to %c100352 step %c1 { %0 = arith.remsi %arg2, %c28 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c28 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c28 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c28 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c28 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c28 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x128x28x28xf16> - %21 = memref.load %arg1[%c0, %19, %13, %3] : memref<1x128x28x28xf16> - %22 = arith.cmpf ogt, %20, %cst : f16 - %23 = arith.select %22, %21, %cst : f16 - memref.store %23, %alloc[%c0, %19, %13, %3] : memref<1x128x28x28xf16> + %1 = arith.divsi %arg2, %c28 : index + %2 = arith.remsi %1, %c28 : index + %3 = arith.divsi %1, %c28 : index + %4 = memref.load %arg0[%c0, %3, %2, %0] : memref<1x128x28x28xf16> + %5 = memref.load %arg1[%c0, %3, %2, %0] : memref<1x128x28x28xf16> + %6 = arith.cmpf ogt, %4, %cst : f16 + %7 = arith.select %6, %5, %cst : f16 + memref.store %7, %alloc[%c0, %3, %2, %0] : memref<1x128x28x28xf16> } return %alloc : memref<1x128x28x28xf16> } func.func private @Unknown57(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 + %c56 = arith.constant 56 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 %c200704 = arith.constant 200704 : index - %c1 = arith.constant 1 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<1x64x56x56xf16> scf.for %arg3 = %c0 to %c200704 step %c1 { %0 = arith.remsi %arg3, %c56 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c56 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg3, %c0 : index - %5 = arith.subi %c-1, %arg3 : index - %6 = arith.select %4, %5, %arg3 : index - %7 = arith.divsi %6, %c56 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c56 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c56 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c56 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg2[%c0, %19, %13, %3] : memref<1x64x56x56xf16> - %21 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x64x56x56xf16> - %22 = memref.load %arg1[%c0, %19, %13, %3] : memref<1x64x56x56xf16> - %23 = arith.addf 
%21, %22 : f16 - %24 = arith.cmpf ogt, %20, %cst : f16 - %25 = arith.select %24, %23, %cst : f16 - memref.store %25, %alloc[%c0, %19, %13, %3] : memref<1x64x56x56xf16> + %1 = arith.divsi %arg3, %c56 : index + %2 = arith.remsi %1, %c56 : index + %3 = arith.divsi %1, %c56 : index + %4 = memref.load %arg0[%c0, %3, %2, %0] : memref<1x64x56x56xf16> + %5 = memref.load %arg1[%c0, %3, %2, %0] : memref<1x64x56x56xf16> + %6 = memref.load %arg2[%c0, %3, %2, %0] : memref<1x64x56x56xf16> + %7 = arith.addf %4, %5 : f16 + %8 = arith.cmpf ogt, %6, %cst : f16 + %9 = arith.select %8, %7, %cst : f16 + memref.store %9, %alloc[%c0, %3, %2, %0] : memref<1x64x56x56xf16> } return %alloc : memref<1x64x56x56xf16> } func.func private @Unknown61(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c1 = arith.constant 1 : index %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %alloc = memref.alloc() : memref<1x64x56x56xf16> - scf.for %arg2 = %c0 to %c200704 step %c1 { - %0 = arith.remsi %arg2, %c56 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c56 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c56 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c56 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c56 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c56 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x64x56x56xf16> - %21 = memref.load %arg1[%c0, %19, %13, %3] : memref<1x64x56x56xf16> - %22 = arith.cmpf ogt, %20, %cst : f16 - %23 = arith.select %22, %21, %cst : f16 - memref.store %23, %alloc[%c0, %19, %13, %3] : memref<1x64x56x56xf16> - } - return %alloc : memref<1x64x56x56xf16> - } - func.func private @Unknown65(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index %c1 = arith.constant 1 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %alloc = memref.alloc() : memref<1x64x56x56xf16> - scf.for %arg3 = %c0 to %c200704 step %c1 { - %0 = arith.remsi %arg3, %c56 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c56 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg3, %c0 : index - %5 = arith.subi %c-1, %arg3 : index - %6 = arith.select %4, %5, %arg3 : index - %7 = arith.divsi %6, %c56 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c56 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c56 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c56 : index - %18 = arith.subi %c-1, %17 : index - %19 = 
arith.select %14, %18, %17 : index - %20 = memref.load %arg2[%c0, %19, %13, %3] : memref<1x64x56x56xf16> - %21 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x64x56x56xf16> - %22 = memref.load %arg1[%c0, %19, %13, %3] : memref<1x64x56x56xf16> - %23 = arith.addf %21, %22 : f16 - %24 = arith.cmpf ogt, %20, %cst : f16 - %25 = arith.select %24, %23, %cst : f16 - memref.store %25, %alloc[%c0, %19, %13, %3] : memref<1x64x56x56xf16> - } - return %alloc : memref<1x64x56x56xf16> - } - func.func private @Unknown69(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 %c200704 = arith.constant 200704 : index - %c1 = arith.constant 1 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<1x64x56x56xf16> scf.for %arg2 = %c0 to %c200704 step %c1 { %0 = arith.remsi %arg2, %c56 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c56 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c56 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c56 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c56 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c56 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x64x56x56xf16> - %21 = memref.load %arg1[%c0, %19, %13, %3] : memref<1x64x56x56xf16> - %22 = arith.cmpf ogt, %20, %cst : f16 - %23 = arith.select %22, %21, %cst : f16 - memref.store %23, %alloc[%c0, %19, %13, %3] : memref<1x64x56x56xf16> + %1 = arith.divsi %arg2, %c56 : index + %2 = arith.remsi %1, %c56 : index + %3 = arith.divsi %1, %c56 : index + %4 = memref.load %arg0[%c0, %3, %2, %0] : memref<1x64x56x56xf16> + %5 = memref.load %arg1[%c0, %3, %2, %0] : memref<1x64x56x56xf16> + %6 = arith.cmpf ogt, %4, %cst : f16 + %7 = arith.select %6, %5, %cst : f16 + memref.store %7, %alloc[%c0, %3, %2, %0] : memref<1x64x56x56xf16> } return %alloc : memref<1x64x56x56xf16> } func.func private @Unknown73(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + %c56 = arith.constant 56 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %c200704 = arith.constant 200704 : index - %c1 = arith.constant 1 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<1x64x56x56xf16> scf.for %arg2 = %c0 to %c200704 step %c1 { %0 = arith.remsi %arg2, %c56 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c56 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c56 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c56 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c56 : index - %13 = arith.select %11, %12, %10 : index - %14 = 
arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c56 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x64x56x56xf16> - %21 = memref.load %arg1[%c0, %19, %13, %3] : memref<1x64x56x56xf16> - %22 = arith.addf %20, %21 : f16 - memref.store %22, %alloc[%c0, %19, %13, %3] : memref<1x64x56x56xf16> + %1 = arith.divsi %arg2, %c56 : index + %2 = arith.remsi %1, %c56 : index + %3 = arith.divsi %1, %c56 : index + %4 = memref.load %arg0[%c0, %3, %2, %0] : memref<1x64x56x56xf16> + %5 = memref.load %arg1[%c0, %3, %2, %0] : memref<1x64x56x56xf16> + %6 = arith.addf %4, %5 : f16 + memref.store %6, %alloc[%c0, %3, %2, %0] : memref<1x64x56x56xf16> } return %alloc : memref<1x64x56x56xf16> } func.func private @Unknown74(%arg0: memref<1x64x112x112xf16>, %arg1: memref<1x64x112x112xf16>) -> memref<1x64x112x112xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 + %c112 = arith.constant 112 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 %c802816 = arith.constant 802816 : index - %c1 = arith.constant 1 : index - %c112 = arith.constant 112 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<1x64x112x112xf16> scf.for %arg2 = %c0 to %c802816 step %c1 { %0 = arith.remsi %arg2, %c112 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c112 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c112 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c112 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c112 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c112 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x64x112x112xf16> - %21 = memref.load %arg1[%c0, %19, %13, %3] : memref<1x64x112x112xf16> - %22 = arith.cmpf ogt, %20, %cst : f16 - %23 = arith.select %22, %21, %cst : f16 - memref.store %23, %alloc[%c0, %19, %13, %3] : memref<1x64x112x112xf16> + %1 = arith.divsi %arg2, %c112 : index + %2 = arith.remsi %1, %c112 : index + %3 = arith.divsi %1, %c112 : index + %4 = memref.load %arg0[%c0, %3, %2, %0] : memref<1x64x112x112xf16> + %5 = memref.load %arg1[%c0, %3, %2, %0] : memref<1x64x112x112xf16> + %6 = arith.cmpf ogt, %4, %cst : f16 + %7 = arith.select %6, %5, %cst : f16 + memref.store %7, %alloc[%c0, %3, %2, %0] : memref<1x64x112x112xf16> } return %alloc : memref<1x64x112x112xf16> } func.func private @Unknown77(%arg0: memref<64x3x7x7xf16>) -> memref<64x3x7x7xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c9408 = arith.constant 9408 : index - %c1 = arith.constant 1 : index %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c9408 = arith.constant 9408 : index %alloc = memref.alloc() : memref<64x3x7x7xf32> scf.for %arg1 = %c0 to %c9408 step %c1 { %0 = arith.remsi %arg1, %c7 : index - %1 = 
arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c7 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c7 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c7 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c7 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c7 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c3 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c3 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c3 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<64x3x7x7xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<64x3x7x7xf32> + %1 = arith.divsi %arg1, %c7 : index + %2 = arith.remsi %1, %c7 : index + %3 = arith.divsi %1, %c7 : index + %4 = arith.remsi %3, %c3 : index + %5 = arith.divsi %3, %c3 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<64x3x7x7xf16> + %7 = arith.extf %6 : f16 to f32 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<64x3x7x7xf32> } return %alloc : memref<64x3x7x7xf32> } - func.func private @Unknown78(%arg0: memref<1x1000xf16>) -> memref<1x1000xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index + func.func private @Unknown78(%arg0: memref<1x1000xf16>) -> memref<1000xf32> attributes {__byteir_elementwise_fusion__} { %c1000 = arith.constant 1000 : index %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %alloc = memref.alloc() : memref<1x1000xf32> scf.for %arg1 = %c0 to %c1000 step %c1 { %0 = memref.load %arg0[%c0, %arg1] : memref<1x1000xf16> %1 = arith.extf %0 : f16 to f32 - memref.store %1, %alloc[%c0, %arg1] : memref<1x1000xf32> + %2 = arith.truncf %1 : f32 to f16 + %3 = arith.extf %2 : f16 to f32 + memref.store %3, %alloc[%c0, %arg1] : memref<1x1000xf32> } - return %alloc : memref<1x1000xf32> + %collapse_shape = memref.collapse_shape %alloc [[0, 1]] : memref<1x1000xf32> into memref<1000xf32> + return %collapse_shape : memref<1000xf32> } - func.func private @Unknown79(%arg0: memref<1000xf32>) -> memref<1000xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c1000 = arith.constant 1000 : index + func.func private @Unknown79(%arg0: memref<1000x512xf16>) -> memref<1000x512xf32> attributes {__byteir_elementwise_fusion__} { + %c512 = arith.constant 512 : index %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<1000xf32> - scf.for %arg1 = %c0 to %c1000 step %c1 { - %0 = memref.load %arg0[%arg1] : memref<1000xf32> - %1 = arith.truncf %0 : f32 to f16 - %2 = arith.extf %1 : f16 to f32 - memref.store %2, %alloc[%arg1] : memref<1000xf32> - } - return %alloc : memref<1000xf32> - } - func.func private @Unknown80(%arg0: memref<1000x512xf16>) -> memref<1000x512xf32> attributes {__byteir_elementwise_fusion__} { %c0 = arith.constant 0 : index %c512000 = arith.constant 512000 : index - %c1 = arith.constant 1 : index - %c512 = 
arith.constant 512 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<1000x512xf32> scf.for %arg1 = %c0 to %c512000 step %c1 { %0 = arith.remsi %arg1, %c512 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c512 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c512 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = memref.load %arg0[%9, %3] : memref<1000x512xf16> - %11 = arith.extf %10 : f16 to f32 - memref.store %11, %alloc[%9, %3] : memref<1000x512xf32> + %1 = arith.divsi %arg1, %c512 : index + %2 = memref.load %arg0[%1, %0] : memref<1000x512xf16> + %3 = arith.extf %2 : f16 to f32 + memref.store %3, %alloc[%1, %0] : memref<1000x512xf32> } return %alloc : memref<1000x512xf32> } - func.func private @Unknown81(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c1 = arith.constant 1 : index + func.func private @Unknown80(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %alloc = memref.alloc() : memref<64x64x3x3xf32> - scf.for %arg1 = %c0 to %c36864 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<64x64x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<64x64x3x3xf32> - } - return %alloc : memref<64x64x3x3xf32> - } - func.func private @Unknown82(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index - %alloc = memref.alloc() : memref<64x64x3x3xf32> - scf.for %arg1 = %c0 to %c36864 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select 
%4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<64x64x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<64x64x3x3xf32> - } - return %alloc : memref<64x64x3x3xf32> - } - func.func private @Unknown83(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { %c0 = arith.constant 0 : index %c36864 = arith.constant 36864 : index - %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index %alloc = memref.alloc() : memref<64x64x3x3xf32> scf.for %arg1 = %c0 to %c36864 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<64x64x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<64x64x3x3xf32> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c64 : index + %5 = arith.divsi %3, %c64 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<64x64x3x3xf16> + %7 = arith.extf %6 : f16 to f32 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<64x64x3x3xf32> } return %alloc : memref<64x64x3x3xf32> } - func.func private @Unknown84(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c1 = arith.constant 1 : index + func.func 
private @Unknown84(%arg0: memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index - %alloc = memref.alloc() : memref<64x64x3x3xf32> - scf.for %arg1 = %c0 to %c36864 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<64x64x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<64x64x3x3xf32> - } - return %alloc : memref<64x64x3x3xf32> - } - func.func private @Unknown85(%arg0: memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %c73728 = arith.constant 73728 : index - %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index %alloc = memref.alloc() : memref<128x64x3x3xf32> scf.for %arg1 = %c0 to %c73728 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<128x64x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<128x64x3x3xf32> + %1 = arith.divsi %arg1, %c3 : index + 
%2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c64 : index + %5 = arith.divsi %3, %c64 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<128x64x3x3xf16> + %7 = arith.extf %6 : f16 to f32 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<128x64x3x3xf32> } return %alloc : memref<128x64x3x3xf32> } - func.func private @Unknown86(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c1 = arith.constant 1 : index + func.func private @Unknown85(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + %c1 = arith.constant 1 : index %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %c147456 = arith.constant 147456 : index %alloc = memref.alloc() : memref<128x128x3x3xf32> scf.for %arg1 = %c0 to %c147456 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c128 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c128 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c128 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<128x128x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<128x128x3x3xf32> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c128 : index + %5 = arith.divsi %3, %c128 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<128x128x3x3xf16> + %7 = arith.extf %6 : f16 to f32 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<128x128x3x3xf32> } return %alloc : memref<128x128x3x3xf32> } - func.func private @Unknown87(%arg0: memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown86(%arg0: memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %c8192 = arith.constant 8192 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<128x64x1x1xf32> scf.for %arg1 = %c0 to %c8192 step %c1 { %0 = arith.remsi %arg1, %c64 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c64 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, 
%c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c64 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = memref.load %arg0[%9, %3, %c0, %c0] : memref<128x64x1x1xf16> - %11 = arith.extf %10 : f16 to f32 - memref.store %11, %alloc[%9, %3, %c0, %c0] : memref<128x64x1x1xf32> + %1 = arith.divsi %arg1, %c64 : index + %2 = memref.load %arg0[%1, %0, %c0, %c0] : memref<128x64x1x1xf16> + %3 = arith.extf %2 : f16 to f32 + memref.store %3, %alloc[%1, %0, %c0, %c0] : memref<128x64x1x1xf32> } return %alloc : memref<128x64x1x1xf32> } - func.func private @Unknown88(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c1 = arith.constant 1 : index + func.func private @Unknown89(%arg0: memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index - %alloc = memref.alloc() : memref<128x128x3x3xf32> - scf.for %arg1 = %c0 to %c147456 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c128 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c128 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c128 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<128x128x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<128x128x3x3xf32> - } - return %alloc : memref<128x128x3x3xf32> - } - func.func private @Unknown89(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %alloc = memref.alloc() : memref<128x128x3x3xf32> - scf.for %arg1 = %c0 to %c147456 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = 
arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c128 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c128 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c128 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<128x128x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<128x128x3x3xf32> - } - return %alloc : memref<128x128x3x3xf32> - } - func.func private @Unknown90(%arg0: memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { %c0 = arith.constant 0 : index %c294912 = arith.constant 294912 : index - %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index %alloc = memref.alloc() : memref<256x128x3x3xf32> scf.for %arg1 = %c0 to %c294912 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c128 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c128 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c128 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<256x128x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<256x128x3x3xf32> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c128 : index + %5 = arith.divsi %3, %c128 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<256x128x3x3xf16> + %7 = arith.extf %6 : f16 to f32 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<256x128x3x3xf32> } return %alloc : memref<256x128x3x3xf32> } - func.func private @Unknown91(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c1 = arith.constant 1 : index + func.func private @Unknown90(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + %c1 = arith.constant 1 : 
index %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index + %c589824 = arith.constant 589824 : index %alloc = memref.alloc() : memref<256x256x3x3xf32> scf.for %arg1 = %c0 to %c589824 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c256 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c256 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c256 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<256x256x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<256x256x3x3xf32> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c256 : index + %5 = arith.divsi %3, %c256 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<256x256x3x3xf16> + %7 = arith.extf %6 : f16 to f32 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<256x256x3x3xf32> } return %alloc : memref<256x256x3x3xf32> } - func.func private @Unknown92(%arg0: memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown91(%arg0: memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %c32768 = arith.constant 32768 : index - %c1 = arith.constant 1 : index - %c128 = arith.constant 128 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<256x128x1x1xf32> scf.for %arg1 = %c0 to %c32768 step %c1 { %0 = arith.remsi %arg1, %c128 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c128 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c128 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = memref.load %arg0[%9, %3, %c0, %c0] : memref<256x128x1x1xf16> - %11 = arith.extf %10 : f16 to f32 - memref.store %11, %alloc[%9, %3, %c0, %c0] : memref<256x128x1x1xf32> + %1 = arith.divsi %arg1, %c128 : index + %2 = memref.load %arg0[%1, %0, %c0, %c0] : memref<256x128x1x1xf16> + %3 = arith.extf %2 : f16 to f32 + memref.store %3, %alloc[%1, %0, %c0, %c0] : memref<256x128x1x1xf32> } return %alloc : memref<256x128x1x1xf32> } - func.func private @Unknown93(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - 
%c589824 = arith.constant 589824 : index - %c1 = arith.constant 1 : index + func.func private @Unknown94(%arg0: memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index - %alloc = memref.alloc() : memref<256x256x3x3xf32> - scf.for %arg1 = %c0 to %c589824 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c256 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c256 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c256 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<256x256x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<256x256x3x3xf32> - } - return %alloc : memref<256x256x3x3xf32> - } - func.func private @Unknown94(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %alloc = memref.alloc() : memref<256x256x3x3xf32> - scf.for %arg1 = %c0 to %c589824 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c256 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c256 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c256 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<256x256x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, 
%alloc[%29, %23, %13, %3] : memref<256x256x3x3xf32> - } - return %alloc : memref<256x256x3x3xf32> - } - func.func private @Unknown95(%arg0: memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { %c0 = arith.constant 0 : index %c1179648 = arith.constant 1179648 : index - %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<512x256x3x3xf32> scf.for %arg1 = %c0 to %c1179648 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c256 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c256 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c256 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<512x256x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<512x256x3x3xf32> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c256 : index + %5 = arith.divsi %3, %c256 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<512x256x3x3xf16> + %7 = arith.extf %6 : f16 to f32 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<512x256x3x3xf32> } return %alloc : memref<512x256x3x3xf32> } - func.func private @Unknown96(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c1 = arith.constant 1 : index + func.func private @Unknown95(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + %c1 = arith.constant 1 : index %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index + %c2359296 = arith.constant 2359296 : index %alloc = memref.alloc() : memref<512x512x3x3xf32> scf.for %arg1 = %c0 to %c2359296 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - 
%15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c512 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c512 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c512 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<512x512x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<512x512x3x3xf32> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c512 : index + %5 = arith.divsi %3, %c512 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<512x512x3x3xf16> + %7 = arith.extf %6 : f16 to f32 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<512x512x3x3xf32> } return %alloc : memref<512x512x3x3xf32> } - func.func private @Unknown97(%arg0: memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown96(%arg0: memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %c131072 = arith.constant 131072 : index - %c1 = arith.constant 1 : index - %c256 = arith.constant 256 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<512x256x1x1xf32> scf.for %arg1 = %c0 to %c131072 step %c1 { %0 = arith.remsi %arg1, %c256 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c256 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c256 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = memref.load %arg0[%9, %3, %c0, %c0] : memref<512x256x1x1xf16> - %11 = arith.extf %10 : f16 to f32 - memref.store %11, %alloc[%9, %3, %c0, %c0] : memref<512x256x1x1xf32> + %1 = arith.divsi %arg1, %c256 : index + %2 = memref.load %arg0[%1, %0, %c0, %c0] : memref<512x256x1x1xf16> + %3 = arith.extf %2 : f16 to f32 + memref.store %3, %alloc[%1, %0, %c0, %c0] : memref<512x256x1x1xf32> } return %alloc : memref<512x256x1x1xf32> } - func.func private @Unknown98(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %alloc = memref.alloc() : memref<512x512x3x3xf32> - scf.for %arg1 = %c0 to %c2359296 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = 
arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c512 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c512 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c512 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<512x512x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<512x512x3x3xf32> - } - return %alloc : memref<512x512x3x3xf32> - } - func.func private @Unknown99(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %alloc = memref.alloc() : memref<512x512x3x3xf32> - scf.for %arg1 = %c0 to %c2359296 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c512 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c512 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c512 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<512x512x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<512x512x3x3xf32> - } - return %alloc : memref<512x512x3x3xf32> - } func.func @main(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>, %arg3: memref<64xf32>, %arg4: memref<64xf32>, %arg5: memref<64xf32>, %arg6: memref<64xf32>, %arg7: memref<64xf32>, %arg8: memref<64xf32>, %arg9: memref<64xf32>, %arg10: memref<128xf32>, %arg11: memref<128xf32>, %arg12: memref<128xf32>, %arg13: memref<128xf32>, %arg14: memref<128xf32>, %arg15: memref<128xf32>, %arg16: memref<128xf32>, %arg17: memref<128xf32>, %arg18: memref<128xf32>, %arg19: memref<128xf32>, %arg20: memref<256xf32>, %arg21: memref<256xf32>, %arg22: memref<256xf32>, %arg23: memref<256xf32>, %arg24: memref<256xf32>, %arg25: memref<256xf32>, %arg26: memref<256xf32>, %arg27: memref<256xf32>, %arg28: memref<256xf32>, %arg29: memref<256xf32>, %arg30: memref<512xf32>, %arg31: memref<512xf32>, %arg32: 
memref<512xf32>, %arg33: memref<512xf32>, %arg34: memref<512xf32>, %arg35: memref<512xf32>, %arg36: memref<512xf32>, %arg37: memref<512xf32>, %arg38: memref<512xf32>, %arg39: memref<512xf32>, %arg40: memref<64xf32>, %arg41: memref<64xf32>, %arg42: memref<64xf32>, %arg43: memref<64xf32>, %arg44: memref<64xf32>, %arg45: memref<64xf32>, %arg46: memref<64xf32>, %arg47: memref<64xf32>, %arg48: memref<64xf32>, %arg49: memref<64xf32>, %arg50: memref<128xf32>, %arg51: memref<128xf32>, %arg52: memref<128xf32>, %arg53: memref<128xf32>, %arg54: memref<128xf32>, %arg55: memref<128xf32>, %arg56: memref<128xf32>, %arg57: memref<128xf32>, %arg58: memref<128xf32>, %arg59: memref<128xf32>, %arg60: memref<256xf32>, %arg61: memref<256xf32>, %arg62: memref<256xf32>, %arg63: memref<256xf32>, %arg64: memref<256xf32>, %arg65: memref<256xf32>, %arg66: memref<256xf32>, %arg67: memref<256xf32>, %arg68: memref<256xf32>, %arg69: memref<256xf32>, %arg70: memref<512xf32>, %arg71: memref<512xf32>, %arg72: memref<512xf32>, %arg73: memref<512xf32>, %arg74: memref<512xf32>, %arg75: memref<512xf32>, %arg76: memref<512xf32>, %arg77: memref<512xf32>, %arg78: memref<512xf32>, %arg79: memref<512xf32>, %arg80: memref<64x3x7x7xf16>, %arg81: memref<1x3x224x224xf16>, %arg82: memref<1x64x112x112xf16>, %arg83: memref<1x64x112x112xf16>, %arg84: memref<1x64x56x56xf16>, %arg85: memref<64x64x3x3xf16>, %arg86: memref<1x64x56x56xf16>, %arg87: memref<1x64x56x56xf16>, %arg88: memref<64x64x3x3xf16>, %arg89: memref<1x64x56x56xf16>, %arg90: memref<1x64x56x56xf16>, %arg91: memref<64x64x3x3xf16>, %arg92: memref<1x64x56x56xf16>, %arg93: memref<1x64x56x56xf16>, %arg94: memref<64x64x3x3xf16>, %arg95: memref<1x64x56x56xf16>, %arg96: memref<1x64x56x56xf16>, %arg97: memref<128x64x3x3xf16>, %arg98: memref<1x128x28x28xf16>, %arg99: memref<1x128x28x28xf16>, %arg100: memref<128x128x3x3xf16>, %arg101: memref<1x128x28x28xf16>, %arg102: memref<128x64x1x1xf16>, %arg103: memref<1x128x28x28xf16>, %arg104: memref<1x128x28x28xf16>, %arg105: memref<128x128x3x3xf16>, %arg106: memref<1x128x28x28xf16>, %arg107: memref<1x128x28x28xf16>, %arg108: memref<128x128x3x3xf16>, %arg109: memref<1x128x28x28xf16>, %arg110: memref<1x128x28x28xf16>, %arg111: memref<256x128x3x3xf16>, %arg112: memref<1x256x14x14xf16>, %arg113: memref<1x256x14x14xf16>, %arg114: memref<256x256x3x3xf16>, %arg115: memref<1x256x14x14xf16>, %arg116: memref<256x128x1x1xf16>, %arg117: memref<1x256x14x14xf16>, %arg118: memref<1x256x14x14xf16>, %arg119: memref<256x256x3x3xf16>, %arg120: memref<1x256x14x14xf16>, %arg121: memref<1x256x14x14xf16>, %arg122: memref<256x256x3x3xf16>, %arg123: memref<1x256x14x14xf16>, %arg124: memref<1x256x14x14xf16>, %arg125: memref<512x256x3x3xf16>, %arg126: memref<1x512x7x7xf16>, %arg127: memref<1x512x7x7xf16>, %arg128: memref<512x512x3x3xf16>, %arg129: memref<1x512x7x7xf16>, %arg130: memref<512x256x1x1xf16>, %arg131: memref<1x512x7x7xf16>, %arg132: memref<1x512x7x7xf16>, %arg133: memref<512x512x3x3xf16>, %arg134: memref<1x512x7x7xf16>, %arg135: memref<1x512x7x7xf16>, %arg136: memref<512x512x3x3xf16>, %arg137: memref<1x512x7x7xf16>, %arg138: memref<1x512x7x7xf16>, %arg139: memref<1x512xf16>, %arg140: memref<512x1000xf16>, %arg141: memref<1x1000xf16>) -> (memref<64xf32>, memref<64xf32>, memref<64x3x7x7xf32>, memref<1000xf32>, memref<1000x512xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64x64x3x3xf32>, memref<128xf32>, 
memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128x64x3x3xf32>, memref<128x128x3x3xf32>, memref<128x64x1x1xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128x128x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256x128x3x3xf32>, memref<256x256x3x3xf32>, memref<256x128x1x1xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256x256x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512x256x3x3xf32>, memref<512x512x3x3xf32>, memref<512x256x1x1xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512x512x3x3xf32>) attributes {__placeholder__byre.entry_point} { %alloc = memref.alloc() : memref<1x512xf16> byre.compute @MatmulOp_f16f16_f16(%arg141, %arg140, %alloc) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<1x1000xf16>, memref<512x1000xf16>, memref<1x512xf16> @@ -1599,7 +496,7 @@ module { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_10, %arg128, %alloc_13) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16> %alloc_14 = memref.alloc() : memref<512x512x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg127, %alloc_10, %alloc_14) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16> - %3 = call @Unknown12(%arg127, %alloc_13) : (memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> + %3 = call @Unknown4(%arg127, %alloc_13) : (memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> %alloc_15 = memref.alloc() : memref<1x512x7x7xf16> %alloc_16 = memref.alloc() : memref<512xf32> %alloc_17 = memref.alloc() : memref<512xf32> @@ -1634,7 +531,7 @@ module { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_30, %arg119, %alloc_33) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16> %alloc_34 = memref.alloc() : memref<256x256x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg118, %alloc_30, %alloc_34) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16> - %6 = call @Unknown27(%4, %alloc_33, %arg118) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> + %6 = call @Unknown19(%4, 
%alloc_33, %arg118) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> %alloc_35 = memref.alloc() : memref<1x256x14x14xf16> %alloc_36 = memref.alloc() : memref<256xf32> %alloc_37 = memref.alloc() : memref<256xf32> @@ -1643,7 +540,7 @@ module { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_35, %arg114, %alloc_38) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16> %alloc_39 = memref.alloc() : memref<256x256x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg113, %alloc_35, %alloc_39) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16> - %7 = call @Unknown31(%arg113, %alloc_38) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> + %7 = call @Unknown23(%arg113, %alloc_38) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> %alloc_40 = memref.alloc() : memref<1x256x14x14xf16> %alloc_41 = memref.alloc() : memref<256xf32> %alloc_42 = memref.alloc() : memref<256xf32> @@ -1678,7 +575,7 @@ module { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_55, %arg105, %alloc_58) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16> %alloc_59 = memref.alloc() : memref<128x128x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg104, %alloc_55, %alloc_59) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16> - %10 = call @Unknown46(%8, %alloc_58, %arg104) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> + %10 = call @Unknown38(%8, %alloc_58, %arg104) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> %alloc_60 = memref.alloc() : memref<1x128x28x28xf16> %alloc_61 = memref.alloc() : memref<128xf32> %alloc_62 = memref.alloc() : memref<128xf32> @@ -1687,7 +584,7 @@ module { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_60, %arg100, %alloc_63) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16> %alloc_64 = memref.alloc() : memref<128x128x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg99, %alloc_60, %alloc_64) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = 
"NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16> - %11 = call @Unknown50(%arg99, %alloc_63) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> + %11 = call @Unknown42(%arg99, %alloc_63) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> %alloc_65 = memref.alloc() : memref<1x128x28x28xf16> %alloc_66 = memref.alloc() : memref<128xf32> %alloc_67 = memref.alloc() : memref<128xf32> @@ -1722,7 +619,7 @@ module { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_80, %arg91, %alloc_83) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16> %alloc_84 = memref.alloc() : memref<64x64x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg90, %alloc_80, %alloc_84) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16> - %14 = call @Unknown65(%12, %alloc_83, %arg90) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> + %14 = call @Unknown57(%12, %alloc_83, %arg90) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> %alloc_85 = memref.alloc() : memref<1x64x56x56xf16> %alloc_86 = memref.alloc() : memref<64xf32> %alloc_87 = memref.alloc() : memref<64xf32> @@ -1731,7 +628,7 @@ module { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_85, %arg88, %alloc_88) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16> %alloc_89 = memref.alloc() : memref<64x64x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg87, %alloc_85, %alloc_89) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16> - %15 = call @Unknown69(%arg87, %alloc_88) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> + %15 = call @Unknown61(%arg87, %alloc_88) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> %alloc_90 = memref.alloc() : memref<1x64x56x56xf16> %alloc_91 = memref.alloc() : memref<64xf32> %alloc_92 = memref.alloc() : memref<64xf32> @@ -1751,34 +648,31 @@ module { %alloc_99 = memref.alloc() : memref<64x3x7x7xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg81, %alloc_96, %alloc_99) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], 
output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x3x224x224xf16>, memref<1x64x112x112xf16>, memref<64x3x7x7xf16> %18 = call @Unknown77(%alloc_99) : (memref<64x3x7x7xf16>) -> memref<64x3x7x7xf32> - %19 = call @Unknown78(%arg141) : (memref<1x1000xf16>) -> memref<1x1000xf32> - %alloc_100 = memref.alloc() : memref<1000xf32> - byre.compute @ReduceSumOp_f32_f32(%19, %alloc_100) {dimensions = dense<0> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<1x1000xf32>, memref<1000xf32> - %20 = call @Unknown79(%alloc_100) : (memref<1000xf32>) -> memref<1000xf32> + %19 = call @Unknown78(%arg141) : (memref<1x1000xf16>) -> memref<1000xf32> %collapse_shape = memref.collapse_shape %arg141 [[0, 1]] : memref<1x1000xf16> into memref<1000xf16> %expand_shape = memref.expand_shape %collapse_shape [[0, 1]] : memref<1000xf16> into memref<1000x1xf16> - %alloc_101 = memref.alloc() : memref<1000x512xf16> - byre.compute @MatmulOp_f16f16_f16(%expand_shape, %arg139, %alloc_101) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<1000x1xf16>, memref<1x512xf16>, memref<1000x512xf16> - %21 = call @Unknown80(%alloc_101) : (memref<1000x512xf16>) -> memref<1000x512xf32> - %22 = call @Unknown81(%alloc_94) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %23 = call @Unknown82(%alloc_89) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %24 = call @Unknown83(%alloc_84) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %25 = call @Unknown84(%alloc_79) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %26 = call @Unknown85(%alloc_69) : (memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> - %27 = call @Unknown86(%alloc_64) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> - %28 = call @Unknown87(%alloc_74) : (memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> - %29 = call @Unknown88(%alloc_59) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> - %30 = call @Unknown89(%alloc_54) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> - %31 = call @Unknown90(%alloc_44) : (memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> - %32 = call @Unknown91(%alloc_39) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> - %33 = call @Unknown92(%alloc_49) : (memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> - %34 = call @Unknown93(%alloc_34) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> - %35 = call @Unknown94(%alloc_29) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> - %36 = call @Unknown95(%alloc_19) : (memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> - %37 = call @Unknown96(%alloc_14) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> - %38 = call @Unknown97(%alloc_24) : (memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> - %39 = call @Unknown98(%alloc_9) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> - %40 = call @Unknown99(%alloc_4) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> - return %alloc_98, %alloc_97, %18, %20, %21, %alloc_92, %alloc_91, %alloc_87, %alloc_86, %22, %23, %alloc_82, %alloc_81, %alloc_77, %alloc_76, %24, %25, %alloc_67, %alloc_66, %alloc_62, %alloc_61, %26, %27, %28, %alloc_72, %alloc_71, %alloc_57, %alloc_56, %alloc_52, %alloc_51, %29, %30, %alloc_42, %alloc_41, %alloc_37, %alloc_36, %31, %32, %33, %alloc_47, %alloc_46, %alloc_32, %alloc_31, %alloc_27, %alloc_26, %34, %35, %alloc_17, %alloc_16, %alloc_12, %alloc_11, %36, %37, %38, %alloc_22, %alloc_21, %alloc_7, %alloc_6, %alloc_2, %alloc_1, %39, %40 : 
memref<64xf32>, memref<64xf32>, memref<64x3x7x7xf32>, memref<1000xf32>, memref<1000x512xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64x64x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128x64x3x3xf32>, memref<128x128x3x3xf32>, memref<128x64x1x1xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128x128x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256x128x3x3xf32>, memref<256x256x3x3xf32>, memref<256x128x1x1xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256x256x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512x256x3x3xf32>, memref<512x512x3x3xf32>, memref<512x256x1x1xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512x512x3x3xf32> + %alloc_100 = memref.alloc() : memref<1000x512xf16> + byre.compute @MatmulOp_f16f16_f16(%expand_shape, %arg139, %alloc_100) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<1000x1xf16>, memref<1x512xf16>, memref<1000x512xf16> + %20 = call @Unknown79(%alloc_100) : (memref<1000x512xf16>) -> memref<1000x512xf32> + %21 = call @Unknown80(%alloc_94) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %22 = call @Unknown80(%alloc_89) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %23 = call @Unknown80(%alloc_84) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %24 = call @Unknown80(%alloc_79) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %25 = call @Unknown84(%alloc_69) : (memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> + %26 = call @Unknown85(%alloc_64) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> + %27 = call @Unknown86(%alloc_74) : (memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> + %28 = call @Unknown85(%alloc_59) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> + %29 = call @Unknown85(%alloc_54) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> + %30 = call @Unknown89(%alloc_44) : (memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> + %31 = call @Unknown90(%alloc_39) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> + %32 = call @Unknown91(%alloc_49) : (memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> + %33 = call @Unknown90(%alloc_34) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> + %34 = call @Unknown90(%alloc_29) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> + %35 = call @Unknown94(%alloc_19) : (memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> + %36 = call @Unknown95(%alloc_14) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> + %37 = call @Unknown96(%alloc_24) : (memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> + %38 = call @Unknown95(%alloc_9) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> + %39 = call @Unknown95(%alloc_4) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> + return %alloc_98, %alloc_97, %18, %19, %20, %alloc_92, %alloc_91, %alloc_87, %alloc_86, %21, %22, %alloc_82, %alloc_81, %alloc_77, %alloc_76, %23, %24, %alloc_67, %alloc_66, %alloc_62, %alloc_61, %25, %26, %27, %alloc_72, %alloc_71, %alloc_57, %alloc_56, %alloc_52, 
%alloc_51, %28, %29, %alloc_42, %alloc_41, %alloc_37, %alloc_36, %30, %31, %32, %alloc_47, %alloc_46, %alloc_32, %alloc_31, %alloc_27, %alloc_26, %33, %34, %alloc_17, %alloc_16, %alloc_12, %alloc_11, %35, %36, %37, %alloc_22, %alloc_21, %alloc_7, %alloc_6, %alloc_2, %alloc_1, %38, %39 : memref<64xf32>, memref<64xf32>, memref<64x3x7x7xf32>, memref<1000xf32>, memref<1000x512xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64x64x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128x64x3x3xf32>, memref<128x128x3x3xf32>, memref<128x64x1x1xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128x128x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256x128x3x3xf32>, memref<256x256x3x3xf32>, memref<256x128x1x1xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256x256x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512x256x3x3xf32>, memref<512x512x3x3xf32>, memref<512x256x1x1xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512x512x3x3xf32> } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/BW/7_set_space_opt.mlir b/compiler/test/E2E/ResNet18/BW/7_set_space_opt.mlir index 6d46f31c1..043350d4b 100644 --- a/compiler/test/E2E/ResNet18/BW/7_set_space_opt.mlir +++ b/compiler/test/E2E/ResNet18/BW/7_set_space_opt.mlir @@ -1,918 +1,253 @@ -// RUN: byteir-opt %s -remove-func-body="anchor-attr=__byteir_elementwise_fusion__" -set-op-space="entry-func=main space=cuda" -set-arg-space="entry-func=main all-space=cuda" | FileCheck %s +// RUN: byteir-opt %s -remove-func-body="anchor-attr=__byteir_elementwise_fusion__" -inline -gpu-launch-func-to-byre -set-op-space="entry-func=main space=cuda" -set-arg-space="entry-func=main all-space=cuda" | FileCheck %s // CHECK-LABEL: func.func @main module attributes {gpu.container_module} { gpu.module @unified { - gpu.func @Unknown99(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : 
index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf32> - } - gpu.return - } - gpu.func @Unknown98(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf32> - } - gpu.return - } - gpu.func @Unknown97(%arg0: memref<512x256x1x1xf16>, %arg1: memref<512x256x1x1xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown96(%arg0: memref<512x256x1x1xf16>, %arg1: memref<512x256x1x1xf32>) kernel { %c131072 = arith.constant 131072 : index + %c0 = arith.constant 0 : index %c256 = arith.constant 256 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c131072 : index - scf.if %5 { - %6 = arith.remsi %4, %c256 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c256 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c256 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<512x256x1x1xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9, %c0, %c0] : 
memref<512x256x1x1xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c131072 step %6 { + %7 = arith.remsi %arg2, %c256 : index + %8 = arith.divsi %arg2, %c256 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<512x256x1x1xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<512x256x1x1xf32> } gpu.return } - gpu.func @Unknown96(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown95(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c512 = arith.constant 512 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c2359296 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x512x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x512x3x3xf32> } gpu.return } - gpu.func @Unknown95(%arg0: memref<512x256x3x3xf16>, %arg1: memref<512x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown94(%arg0: memref<512x256x3x3xf16>, %arg1: memref<512x256x3x3xf32>) kernel { %c1179648 = arith.constant 1179648 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1179648 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : 
index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x256x3x3xf32> - } - gpu.return - } - gpu.func @Unknown94(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1179648 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x256x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x256x3x3xf32> } 
gpu.return } - gpu.func @Unknown93(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf32> - } - gpu.return - } - gpu.func @Unknown92(%arg0: memref<256x128x1x1xf16>, %arg1: memref<256x128x1x1xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown91(%arg0: memref<256x128x1x1xf16>, %arg1: memref<256x128x1x1xf32>) kernel { %c32768 = arith.constant 32768 : index + %c0 = arith.constant 0 : index %c128 = arith.constant 128 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c32768 : index - scf.if %5 { - %6 = arith.remsi %4, %c128 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c128 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c128 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<256x128x1x1xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<256x128x1x1xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c32768 step %6 { + %7 = arith.remsi %arg2, %c128 : index + %8 = arith.divsi %arg2, %c128 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<256x128x1x1xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<256x128x1x1xf32> } gpu.return } - gpu.func @Unknown91(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown90(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) 
kernel { %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf32> - } - gpu.return - } - gpu.func @Unknown90(%arg0: memref<256x128x3x3xf16>, %arg1: memref<256x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c294912 = arith.constant 294912 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c294912 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x128x3x3xf32> + %5 = gpu.grid_dim 
x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c589824 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x256x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x256x3x3xf32> } gpu.return } - gpu.func @Unknown89(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown89(%arg0: memref<256x128x3x3xf16>, %arg1: memref<256x128x3x3xf32>) kernel { + %c294912 = arith.constant 294912 : index %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf32> - } - gpu.return - } - gpu.func @Unknown88(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 
= arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c294912 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x128x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x128x3x3xf32> } gpu.return } - gpu.func @Unknown87(%arg0: memref<128x64x1x1xf16>, %arg1: memref<128x64x1x1xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown86(%arg0: memref<128x64x1x1xf16>, %arg1: memref<128x64x1x1xf32>) kernel { %c8192 = arith.constant 8192 : index + %c0 = arith.constant 0 : index %c64 = arith.constant 64 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c8192 : index - scf.if %5 { - %6 = arith.remsi %4, %c64 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c64 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c64 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<128x64x1x1xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<128x64x1x1xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c8192 step %6 { + %7 = arith.remsi %arg2, %c64 : index + %8 = arith.divsi %arg2, %c64 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<128x64x1x1xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<128x64x1x1xf32> } gpu.return } - gpu.func @Unknown86(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown85(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, 
%c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf32> - } - gpu.return - } - gpu.func @Unknown85(%arg0: memref<128x64x3x3xf16>, %arg1: memref<128x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c73728 = arith.constant 73728 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c73728 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c147456 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x128x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x128x3x3xf32> } gpu.return } - gpu.func @Unknown84(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 
36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown84(%arg0: memref<128x64x3x3xf16>, %arg1: memref<128x64x3x3xf32>) kernel { + %c73728 = arith.constant 73728 : index %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> - } - gpu.return - } - gpu.func @Unknown83(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store 
%37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c73728 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x64x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x64x3x3xf32> } gpu.return } - gpu.func @Unknown82(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown80(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> - } - gpu.return - } - gpu.func @Unknown81(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = 
arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c36864 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x64x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x64x3x3xf32> } gpu.return } - gpu.func @Unknown80(%arg0: memref<1000x512xf16>, %arg1: memref<1000x512xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown79(%arg0: memref<1000x512xf16>, %arg1: memref<1000x512xf32>) kernel { %c512000 = arith.constant 512000 : index %c512 = arith.constant 512 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512000 : index - scf.if %5 { - %6 = arith.remsi %4, %c512 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c512 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c512 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9] : memref<1000x512xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9] : memref<1000x512xf32> - } - gpu.return - } - gpu.func @Unknown79(%arg0: memref<1000xf32>, %arg1: memref<1000xf32>) kernel { - %c1000 = arith.constant 1000 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1000 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<1000xf32> - %7 = arith.truncf %6 : f32 to f16 - %8 = arith.extf %7 : f16 to f32 - memref.store %8, %arg1[%4] : memref<1000xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c512000 step %6 { + %7 = arith.remsi %arg2, %c512 : index + %8 = arith.divsi %arg2, %c512 : index + %9 = memref.load %arg0[%8, %7] : memref<1000x512xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7] : memref<1000x512xf32> } gpu.return } @@ -924,1144 +259,517 @@ module attributes {gpu.container_module} { %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1000 : index - scf.if %5 { - %6 = memref.load %arg0[%c0, %4] : memref<1x1000xf16> - %7 = arith.extf %6 : f16 to f32 - memref.store %7, %arg1[%c0, %4] : memref<1x1000xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1000 step 
%6 { + %7 = memref.load %arg0[%c0, %arg2] : memref<1x1000xf16> + %8 = arith.extf %7 : f16 to f32 + %9 = arith.truncf %8 : f32 to f16 + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%c0, %arg2] : memref<1x1000xf32> } gpu.return } gpu.func @Unknown77(%arg0: memref<64x3x7x7xf16>, %arg1: memref<64x3x7x7xf32>) kernel { - %c0 = arith.constant 0 : index %c9408 = arith.constant 9408 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %c3 = arith.constant 3 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c9408 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c3 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c3 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c3 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x3x7x7xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x3x7x7xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c9408 step %6 { + %7 = arith.remsi %arg2, %c7 : index + %8 = arith.divsi %arg2, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c3 : index + %12 = arith.divsi %10, %c3 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x3x7x7xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x3x7x7xf32> } gpu.return } gpu.func @Unknown74(%arg0: memref<1x64x112x112xf16>, %arg1: memref<1x64x112x112xf16>, %arg2: memref<1x64x112x112xf16>) kernel { + %c802816 = arith.constant 802816 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index %c112 = arith.constant 112 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c112 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c112 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c112 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c112 : index - %17 = arith.cmpi slt, %16, %c0 
: index - %18 = arith.addi %16, %c112 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c112 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x112x112xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x112x112xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x64x112x112xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg3, %c112 : index + %8 = arith.divsi %arg3, %c112 : index + %9 = arith.remsi %8, %c112 : index + %10 = arith.divsi %8, %c112 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x112x112xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x64x112x112xf16> + %13 = arith.cmpf ogt, %11, %cst : f16 + %14 = arith.select %13, %12, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x64x112x112xf16> } gpu.return } gpu.func @Unknown73(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) kernel { - %c0 = arith.constant 0 : index %c200704 = arith.constant 200704 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = arith.addf %26, %27 : f16 - memref.store %28, %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - } - gpu.return - } - gpu.func @Unknown69(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, 
%14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - } - gpu.return - } - gpu.func @Unknown65(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>, %arg3: memref<1x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg3, %c56 : index + %8 = arith.divsi %arg3, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %13 = arith.addf %11, %12 : f16 + memref.store %13, %arg2[%c0, %10, %9, %7] : memref<1x64x56x56xf16> } gpu.return } gpu.func @Unknown61(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) kernel { + %c200704 = arith.constant 200704 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - 
%8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg3, %c56 : index + %8 = arith.divsi %arg3, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %13 = arith.cmpf ogt, %11, %cst : f16 + %14 = arith.select %13, %12, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x64x56x56xf16> } gpu.return } gpu.func @Unknown57(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>, %arg3: memref<1x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index %c200704 = arith.constant 200704 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - } - gpu.return - } - gpu.func @Unknown50(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) kernel { %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c28 = arith.constant 28 : 
index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - } - gpu.return - } - gpu.func @Unknown46(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>, %arg3: memref<1x128x28x28xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index + %c56 = arith.constant 56 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg4, %c56 : index + %8 = arith.divsi %arg4, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %13 = memref.load %arg2[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %14 = arith.addf 
%11, %12 : f16 + %15 = arith.cmpf ogt, %13, %cst : f16 + %16 = arith.select %15, %14, %cst : f16 + memref.store %16, %arg3[%c0, %10, %9, %7] : memref<1x64x56x56xf16> } gpu.return } gpu.func @Unknown42(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) kernel { + %c100352 = arith.constant 100352 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg3, %c28 : index + %8 = arith.divsi %arg3, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %13 = arith.cmpf ogt, %11, %cst : f16 + %14 = arith.select %13, %12, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x128x28x28xf16> } gpu.return } gpu.func @Unknown38(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>, %arg3: memref<1x128x28x28xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index %c100352 = arith.constant 100352 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 
= arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - } - gpu.return - } - gpu.func @Unknown31(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - } - gpu.return - } - gpu.func @Unknown27(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>, %arg3: memref<1x256x14x14xf16>) kernel { %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index + %c28 = arith.constant 28 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 
= memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg4, %c28 : index + %8 = arith.divsi %arg4, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %13 = memref.load %arg2[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %14 = arith.addf %11, %12 : f16 + %15 = arith.cmpf ogt, %13, %cst : f16 + %16 = arith.select %15, %14, %cst : f16 + memref.store %16, %arg3[%c0, %10, %9, %7] : memref<1x128x28x28xf16> } gpu.return } gpu.func @Unknown23(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>) kernel { + %c50176 = arith.constant 50176 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c50176 step %6 { + %7 = arith.remsi %arg3, %c14 : index + %8 = arith.divsi %arg3, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %13 = arith.cmpf ogt, %11, %cst : f16 + %14 = arith.select %13, %12, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x256x14x14xf16> } gpu.return } gpu.func @Unknown19(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>, %arg3: memref<1x256x14x14xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index %c50176 = arith.constant 50176 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x 
- %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - } - gpu.return - } - gpu.func @Unknown12(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) kernel { %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index + %c14 = arith.constant 14 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c50176 step %6 { + %7 = arith.remsi %arg4, %c14 : index + %8 = arith.divsi %arg4, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %13 = memref.load %arg2[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %14 = arith.addf %11, %12 : f16 + %15 = arith.cmpf ogt, %13, %cst : f16 + %16 = arith.select %15, %14, %cst : f16 + memref.store %16, %arg3[%c0, %10, %9, %7] : memref<1x256x14x14xf16> 
} gpu.return } gpu.func @Unknown8(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>, %arg3: memref<1x512x7x7xf16>) kernel { + %c25088 = arith.constant 25088 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c25088 step %6 { + %7 = arith.remsi %arg4, %c7 : index + %8 = arith.divsi %arg4, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %13 = memref.load %arg2[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %14 = arith.addf %11, %12 : f16 + %15 = arith.cmpf ogt, %13, %cst : f16 + %16 = arith.select %15, %14, %cst : f16 + memref.store %16, %arg3[%c0, %10, %9, %7] : memref<1x512x7x7xf16> } gpu.return } gpu.func @Unknown4(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) kernel { + %c25088 = arith.constant 25088 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : 
index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c25088 step %6 { + %7 = arith.remsi %arg3, %c7 : index + %8 = arith.divsi %arg3, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %13 = arith.cmpf ogt, %11, %cst : f16 + %14 = arith.select %13, %12, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x512x7x7xf16> } gpu.return } gpu.func @Unknown0(%arg0: memref<1x512xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %cst_0 = arith.constant 4.900000e+01 : f16 - %c0 = arith.constant 0 : index %c25088 = arith.constant 25088 : index + %cst = arith.constant 4.900000e+01 : f16 + %cst_0 = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = memref.load %arg0[%c0, %25] : memref<1x512xf16> - %28 = arith.divf %27, %cst_0 : f16 - %29 = arith.cmpf ogt, %26, %cst : f16 - %30 = arith.select %29, %28, %cst : f16 - memref.store %30, %arg2[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c25088 step %6 { + %7 = arith.remsi %arg3, %c7 : index + %8 = arith.divsi %arg3, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = memref.load %arg0[%c0, %10] : memref<1x512xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %13 = arith.divf %11, %cst : f16 + %14 = arith.cmpf ogt, %12, %cst_0 : f16 + %15 = arith.select %14, %13, %cst_0 : f16 + memref.store %15, %arg2[%c0, %10, %9, %7] : memref<1x512x7x7xf16> } gpu.return } } - func.func private @Unknown0(%arg0: memref<1x512xf16>, %arg1: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [2 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown0", 
__byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown0(%arg0: memref<1x512xf16>, %arg1: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 25 : i32, __byre__arg_ranks = [2 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown0", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c25 = arith.constant 25 : index %c1 = arith.constant 1 : index - %c196 = arith.constant 196 : index - %alloc = memref.alloc() : memref<1x512x7x7xf16> - gpu.launch_func @unified::@Unknown0 blocks in (%c196, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x512xf16>, %arg1 : memref<1x512x7x7xf16>, %alloc : memref<1x512x7x7xf16>) - return %alloc : memref<1x512x7x7xf16> - } - func.func private @Unknown4(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown4", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c196 = arith.constant 196 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<1x512x7x7xf16> - gpu.launch_func @unified::@Unknown4 blocks in (%c196, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x512x7x7xf16>, %arg1 : memref<1x512x7x7xf16>, %alloc : memref<1x512x7x7xf16>) + gpu.launch_func @unified::@Unknown0 blocks in (%c25, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1x512xf16>, %arg1 : memref<1x512x7x7xf16>, %alloc : memref<1x512x7x7xf16>) return %alloc : memref<1x512x7x7xf16> } - func.func private @Unknown8(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown8", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown4(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 25 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown4", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c25 = arith.constant 25 : index %c1 = arith.constant 1 : index - %c196 = arith.constant 196 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<1x512x7x7xf16> - gpu.launch_func @unified::@Unknown8 blocks in (%c196, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x512x7x7xf16>, %arg1 : memref<1x512x7x7xf16>, %arg2 : memref<1x512x7x7xf16>, %alloc : memref<1x512x7x7xf16>) + gpu.launch_func @unified::@Unknown4 blocks in (%c25, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1x512x7x7xf16>, %arg1 : memref<1x512x7x7xf16>, %alloc : memref<1x512x7x7xf16>) return %alloc : memref<1x512x7x7xf16> } - func.func private @Unknown12(%arg0: 
memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown12", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown8(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 25 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown8", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c25 = arith.constant 25 : index %c1 = arith.constant 1 : index - %c196 = arith.constant 196 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<1x512x7x7xf16> - gpu.launch_func @unified::@Unknown12 blocks in (%c196, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x512x7x7xf16>, %arg1 : memref<1x512x7x7xf16>, %alloc : memref<1x512x7x7xf16>) + gpu.launch_func @unified::@Unknown8 blocks in (%c25, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1x512x7x7xf16>, %arg1 : memref<1x512x7x7xf16>, %arg2 : memref<1x512x7x7xf16>, %alloc : memref<1x512x7x7xf16>) return %alloc : memref<1x512x7x7xf16> } - func.func private @Unknown19(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 392 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown19", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown19(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 49 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown19", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c49 = arith.constant 49 : index %c1 = arith.constant 1 : index - %c392 = arith.constant 392 : index - %alloc = memref.alloc() : memref<1x256x14x14xf16> - gpu.launch_func @unified::@Unknown19 blocks in (%c392, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x256x14x14xf16>, %arg1 : memref<1x256x14x14xf16>, %arg2 : memref<1x256x14x14xf16>, %alloc : memref<1x256x14x14xf16>) - return %alloc : memref<1x256x14x14xf16> - } - func.func private @Unknown23(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 392 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown23", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c392 = arith.constant 392 : index - %alloc = memref.alloc() : memref<1x256x14x14xf16> - gpu.launch_func @unified::@Unknown23 blocks in (%c392, %c1, %c1) threads in 
(%c128, %c1, %c1) args(%arg0 : memref<1x256x14x14xf16>, %arg1 : memref<1x256x14x14xf16>, %alloc : memref<1x256x14x14xf16>) - return %alloc : memref<1x256x14x14xf16> - } - func.func private @Unknown27(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 392 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown27", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c392 = arith.constant 392 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<1x256x14x14xf16> - gpu.launch_func @unified::@Unknown27 blocks in (%c392, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x256x14x14xf16>, %arg1 : memref<1x256x14x14xf16>, %arg2 : memref<1x256x14x14xf16>, %alloc : memref<1x256x14x14xf16>) + gpu.launch_func @unified::@Unknown19 blocks in (%c49, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1x256x14x14xf16>, %arg1 : memref<1x256x14x14xf16>, %arg2 : memref<1x256x14x14xf16>, %alloc : memref<1x256x14x14xf16>) return %alloc : memref<1x256x14x14xf16> } - func.func private @Unknown31(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 392 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown31", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown23(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 49 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown23", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c49 = arith.constant 49 : index %c1 = arith.constant 1 : index - %c392 = arith.constant 392 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<1x256x14x14xf16> - gpu.launch_func @unified::@Unknown31 blocks in (%c392, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x256x14x14xf16>, %arg1 : memref<1x256x14x14xf16>, %alloc : memref<1x256x14x14xf16>) + gpu.launch_func @unified::@Unknown23 blocks in (%c49, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1x256x14x14xf16>, %arg1 : memref<1x256x14x14xf16>, %alloc : memref<1x256x14x14xf16>) return %alloc : memref<1x256x14x14xf16> } - func.func private @Unknown38(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown38", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c784 = arith.constant 784 : index - %alloc = memref.alloc() : memref<1x128x28x28xf16> - gpu.launch_func @unified::@Unknown38 blocks in (%c784, %c1, %c1) threads in (%c128, %c1, %c1) 
args(%arg0 : memref<1x128x28x28xf16>, %arg1 : memref<1x128x28x28xf16>, %arg2 : memref<1x128x28x28xf16>, %alloc : memref<1x128x28x28xf16>) - return %alloc : memref<1x128x28x28xf16> - } - func.func private @Unknown42(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown42", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown38(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 98 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown38", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c98 = arith.constant 98 : index %c1 = arith.constant 1 : index - %c784 = arith.constant 784 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<1x128x28x28xf16> - gpu.launch_func @unified::@Unknown42 blocks in (%c784, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x128x28x28xf16>, %arg1 : memref<1x128x28x28xf16>, %alloc : memref<1x128x28x28xf16>) + gpu.launch_func @unified::@Unknown38 blocks in (%c98, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1x128x28x28xf16>, %arg1 : memref<1x128x28x28xf16>, %arg2 : memref<1x128x28x28xf16>, %alloc : memref<1x128x28x28xf16>) return %alloc : memref<1x128x28x28xf16> } - func.func private @Unknown46(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown46", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown42(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 98 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown42", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c98 = arith.constant 98 : index %c1 = arith.constant 1 : index - %c784 = arith.constant 784 : index - %alloc = memref.alloc() : memref<1x128x28x28xf16> - gpu.launch_func @unified::@Unknown46 blocks in (%c784, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x128x28x28xf16>, %arg1 : memref<1x128x28x28xf16>, %arg2 : memref<1x128x28x28xf16>, %alloc : memref<1x128x28x28xf16>) - return %alloc : memref<1x128x28x28xf16> - } - func.func private @Unknown50(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown50", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : 
index - %c1 = arith.constant 1 : index - %c784 = arith.constant 784 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<1x128x28x28xf16> - gpu.launch_func @unified::@Unknown50 blocks in (%c784, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x128x28x28xf16>, %arg1 : memref<1x128x28x28xf16>, %alloc : memref<1x128x28x28xf16>) + gpu.launch_func @unified::@Unknown42 blocks in (%c98, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1x128x28x28xf16>, %arg1 : memref<1x128x28x28xf16>, %alloc : memref<1x128x28x28xf16>) return %alloc : memref<1x128x28x28xf16> } - func.func private @Unknown57(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown57", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c1568 = arith.constant 1568 : index - %alloc = memref.alloc() : memref<1x64x56x56xf16> - gpu.launch_func @unified::@Unknown57 blocks in (%c1568, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x64x56x56xf16>, %arg1 : memref<1x64x56x56xf16>, %arg2 : memref<1x64x56x56xf16>, %alloc : memref<1x64x56x56xf16>) - return %alloc : memref<1x64x56x56xf16> - } - func.func private @Unknown61(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown61", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c1568 = arith.constant 1568 : index - %alloc = memref.alloc() : memref<1x64x56x56xf16> - gpu.launch_func @unified::@Unknown61 blocks in (%c1568, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x64x56x56xf16>, %arg1 : memref<1x64x56x56xf16>, %alloc : memref<1x64x56x56xf16>) - return %alloc : memref<1x64x56x56xf16> - } - func.func private @Unknown65(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown65", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown57(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown57", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c196 = arith.constant 196 : index %c1 = arith.constant 1 : index - %c1568 = arith.constant 1568 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<1x64x56x56xf16> - gpu.launch_func @unified::@Unknown65 blocks in (%c1568, %c1, %c1) threads in (%c128, 
%c1, %c1) args(%arg0 : memref<1x64x56x56xf16>, %arg1 : memref<1x64x56x56xf16>, %arg2 : memref<1x64x56x56xf16>, %alloc : memref<1x64x56x56xf16>) + gpu.launch_func @unified::@Unknown57 blocks in (%c196, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1x64x56x56xf16>, %arg1 : memref<1x64x56x56xf16>, %arg2 : memref<1x64x56x56xf16>, %alloc : memref<1x64x56x56xf16>) return %alloc : memref<1x64x56x56xf16> } - func.func private @Unknown69(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown69", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown61(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown61", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c196 = arith.constant 196 : index %c1 = arith.constant 1 : index - %c1568 = arith.constant 1568 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<1x64x56x56xf16> - gpu.launch_func @unified::@Unknown69 blocks in (%c1568, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x64x56x56xf16>, %arg1 : memref<1x64x56x56xf16>, %alloc : memref<1x64x56x56xf16>) + gpu.launch_func @unified::@Unknown61 blocks in (%c196, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1x64x56x56xf16>, %arg1 : memref<1x64x56x56xf16>, %alloc : memref<1x64x56x56xf16>) return %alloc : memref<1x64x56x56xf16> } - func.func private @Unknown73(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown73", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown73(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown73", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c196 = arith.constant 196 : index %c1 = arith.constant 1 : index - %c1568 = arith.constant 1568 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<1x64x56x56xf16> - gpu.launch_func @unified::@Unknown73 blocks in (%c1568, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x64x56x56xf16>, %arg1 : memref<1x64x56x56xf16>, %alloc : memref<1x64x56x56xf16>) + gpu.launch_func @unified::@Unknown73 blocks in (%c196, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1x64x56x56xf16>, %arg1 : memref<1x64x56x56xf16>, %alloc : memref<1x64x56x56xf16>) return %alloc : memref<1x64x56x56xf16> } - func.func private @Unknown74(%arg0: memref<1x64x112x112xf16>, %arg1: memref<1x64x112x112xf16>) -> memref<1x64x112x112xf16> attributes {__byre__BlockSize.x = 
128 : i32, __byre__GridSize.x = 6272 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown74", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown74(%arg0: memref<1x64x112x112xf16>, %arg1: memref<1x64x112x112xf16>) -> memref<1x64x112x112xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown74", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c784 = arith.constant 784 : index %c1 = arith.constant 1 : index - %c6272 = arith.constant 6272 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<1x64x112x112xf16> - gpu.launch_func @unified::@Unknown74 blocks in (%c6272, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x64x112x112xf16>, %arg1 : memref<1x64x112x112xf16>, %alloc : memref<1x64x112x112xf16>) + gpu.launch_func @unified::@Unknown74 blocks in (%c784, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1x64x112x112xf16>, %arg1 : memref<1x64x112x112xf16>, %alloc : memref<1x64x112x112xf16>) return %alloc : memref<1x64x112x112xf16> } - func.func private @Unknown77(%arg0: memref<64x3x7x7xf16>) -> memref<64x3x7x7xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 74 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown77", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown77(%arg0: memref<64x3x7x7xf16>) -> memref<64x3x7x7xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 10 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown77", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c10 = arith.constant 10 : index %c1 = arith.constant 1 : index - %c74 = arith.constant 74 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<64x3x7x7xf32> - gpu.launch_func @unified::@Unknown77 blocks in (%c74, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64x3x7x7xf16>, %alloc : memref<64x3x7x7xf32>) + gpu.launch_func @unified::@Unknown77 blocks in (%c10, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<64x3x7x7xf16>, %alloc : memref<64x3x7x7xf32>) return %alloc : memref<64x3x7x7xf32> } - func.func private @Unknown78(%arg0: memref<1x1000xf16>) -> memref<1x1000xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 8 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown78", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown78(%arg0: memref<1x1000xf16>) -> memref<1000xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown78", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { %c1 = arith.constant 1 : index - %c8 = arith.constant 8 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<1x1000xf32> - gpu.launch_func 
@unified::@Unknown78 blocks in (%c8, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x1000xf16>, %alloc : memref<1x1000xf32>) - return %alloc : memref<1x1000xf32> + %collapse_shape = memref.collapse_shape %alloc [[0, 1]] : memref<1x1000xf32> into memref<1000xf32> + gpu.launch_func @unified::@Unknown78 blocks in (%c1, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1x1000xf16>, %alloc : memref<1x1000xf32>) + return %collapse_shape : memref<1000xf32> } - func.func private @Unknown79(%arg0: memref<1000xf32>) -> memref<1000xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 8 : i32, __byre__arg_ranks = [1 : i32, 1 : i32], __byre__kernel_name = "Unknown79", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown79(%arg0: memref<1000x512xf16>) -> memref<1000x512xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 500 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown79", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c500 = arith.constant 500 : index %c1 = arith.constant 1 : index - %c8 = arith.constant 8 : index - %alloc = memref.alloc() : memref<1000xf32> - gpu.launch_func @unified::@Unknown79 blocks in (%c8, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1000xf32>, %alloc : memref<1000xf32>) - return %alloc : memref<1000xf32> - } - func.func private @Unknown80(%arg0: memref<1000x512xf16>) -> memref<1000x512xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4000 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown80", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c4000 = arith.constant 4000 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<1000x512xf32> - gpu.launch_func @unified::@Unknown80 blocks in (%c4000, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1000x512xf16>, %alloc : memref<1000x512xf32>) + gpu.launch_func @unified::@Unknown79 blocks in (%c500, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1000x512xf16>, %alloc : memref<1000x512xf32>) return %alloc : memref<1000x512xf32> } - func.func private @Unknown81(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown81", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown80(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 36 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown80", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c36 = arith.constant 36 : index %c1 = arith.constant 1 : index - %c288 = arith.constant 288 : index - %alloc = memref.alloc() : memref<64x64x3x3xf32> - gpu.launch_func @unified::@Unknown81 blocks in (%c288, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64x64x3x3xf16>, %alloc : memref<64x64x3x3xf32>) - return 
%alloc : memref<64x64x3x3xf32> - } - func.func private @Unknown82(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown82", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c288 = arith.constant 288 : index - %alloc = memref.alloc() : memref<64x64x3x3xf32> - gpu.launch_func @unified::@Unknown82 blocks in (%c288, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64x64x3x3xf16>, %alloc : memref<64x64x3x3xf32>) - return %alloc : memref<64x64x3x3xf32> - } - func.func private @Unknown83(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown83", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c288 = arith.constant 288 : index - %alloc = memref.alloc() : memref<64x64x3x3xf32> - gpu.launch_func @unified::@Unknown83 blocks in (%c288, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64x64x3x3xf16>, %alloc : memref<64x64x3x3xf32>) - return %alloc : memref<64x64x3x3xf32> - } - func.func private @Unknown84(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown84", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c288 = arith.constant 288 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<64x64x3x3xf32> - gpu.launch_func @unified::@Unknown84 blocks in (%c288, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64x64x3x3xf16>, %alloc : memref<64x64x3x3xf32>) + gpu.launch_func @unified::@Unknown80 blocks in (%c36, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<64x64x3x3xf16>, %alloc : memref<64x64x3x3xf32>) return %alloc : memref<64x64x3x3xf32> } - func.func private @Unknown85(%arg0: memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 576 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown85", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown84(%arg0: memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 72 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown84", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c72 = arith.constant 72 : index %c1 = arith.constant 1 : index - %c576 = arith.constant 576 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<128x64x3x3xf32> - gpu.launch_func @unified::@Unknown85 blocks in (%c576, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128x64x3x3xf16>, %alloc : memref<128x64x3x3xf32>) + gpu.launch_func @unified::@Unknown84 
blocks in (%c72, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<128x64x3x3xf16>, %alloc : memref<128x64x3x3xf32>) return %alloc : memref<128x64x3x3xf32> } - func.func private @Unknown86(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown86", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown85(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 144 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown85", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c144 = arith.constant 144 : index %c1 = arith.constant 1 : index - %c1152 = arith.constant 1152 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<128x128x3x3xf32> - gpu.launch_func @unified::@Unknown86 blocks in (%c1152, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128x128x3x3xf16>, %alloc : memref<128x128x3x3xf32>) + gpu.launch_func @unified::@Unknown85 blocks in (%c144, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<128x128x3x3xf16>, %alloc : memref<128x128x3x3xf32>) return %alloc : memref<128x128x3x3xf32> } - func.func private @Unknown87(%arg0: memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 64 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown87", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown86(%arg0: memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 8 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown86", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c8 = arith.constant 8 : index %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<128x64x1x1xf32> - gpu.launch_func @unified::@Unknown87 blocks in (%c64, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128x64x1x1xf16>, %alloc : memref<128x64x1x1xf32>) + gpu.launch_func @unified::@Unknown86 blocks in (%c8, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<128x64x1x1xf16>, %alloc : memref<128x64x1x1xf32>) return %alloc : memref<128x64x1x1xf32> } - func.func private @Unknown88(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown88", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c1152 = arith.constant 1152 : index - %alloc = memref.alloc() : memref<128x128x3x3xf32> - gpu.launch_func @unified::@Unknown88 blocks in (%c1152, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128x128x3x3xf16>, %alloc : memref<128x128x3x3xf32>) - return %alloc : memref<128x128x3x3xf32> - } 
- func.func private @Unknown89(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown89", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c1152 = arith.constant 1152 : index - %alloc = memref.alloc() : memref<128x128x3x3xf32> - gpu.launch_func @unified::@Unknown89 blocks in (%c1152, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128x128x3x3xf16>, %alloc : memref<128x128x3x3xf32>) - return %alloc : memref<128x128x3x3xf32> - } - func.func private @Unknown90(%arg0: memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2304 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown90", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown89(%arg0: memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown89", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c288 = arith.constant 288 : index %c1 = arith.constant 1 : index - %c2304 = arith.constant 2304 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<256x128x3x3xf32> - gpu.launch_func @unified::@Unknown90 blocks in (%c2304, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256x128x3x3xf16>, %alloc : memref<256x128x3x3xf32>) + gpu.launch_func @unified::@Unknown89 blocks in (%c288, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<256x128x3x3xf16>, %alloc : memref<256x128x3x3xf32>) return %alloc : memref<256x128x3x3xf32> } - func.func private @Unknown91(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4608 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown91", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown90(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 576 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown90", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c576 = arith.constant 576 : index %c1 = arith.constant 1 : index - %c4608 = arith.constant 4608 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<256x256x3x3xf32> - gpu.launch_func @unified::@Unknown91 blocks in (%c4608, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256x256x3x3xf16>, %alloc : memref<256x256x3x3xf32>) + gpu.launch_func @unified::@Unknown90 blocks in (%c576, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<256x256x3x3xf16>, %alloc : memref<256x256x3x3xf32>) return %alloc : memref<256x256x3x3xf32> } - func.func private @Unknown92(%arg0: memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> attributes {__byre__BlockSize.x = 128 : i32, 
__byre__GridSize.x = 256 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown92", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown91(%arg0: memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 32 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown91", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c32 = arith.constant 32 : index %c1 = arith.constant 1 : index %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<256x128x1x1xf32> - gpu.launch_func @unified::@Unknown92 blocks in (%c256, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256x128x1x1xf16>, %alloc : memref<256x128x1x1xf32>) + gpu.launch_func @unified::@Unknown91 blocks in (%c32, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<256x128x1x1xf16>, %alloc : memref<256x128x1x1xf32>) return %alloc : memref<256x128x1x1xf32> } - func.func private @Unknown93(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4608 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown93", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c4608 = arith.constant 4608 : index - %alloc = memref.alloc() : memref<256x256x3x3xf32> - gpu.launch_func @unified::@Unknown93 blocks in (%c4608, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256x256x3x3xf16>, %alloc : memref<256x256x3x3xf32>) - return %alloc : memref<256x256x3x3xf32> - } - func.func private @Unknown94(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4608 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown94", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c4608 = arith.constant 4608 : index - %alloc = memref.alloc() : memref<256x256x3x3xf32> - gpu.launch_func @unified::@Unknown94 blocks in (%c4608, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256x256x3x3xf16>, %alloc : memref<256x256x3x3xf32>) - return %alloc : memref<256x256x3x3xf32> - } - func.func private @Unknown95(%arg0: memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 9216 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown95", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown94(%arg0: memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown94", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c1152 = arith.constant 1152 : index %c1 = arith.constant 1 : index - %c9216 = arith.constant 9216 : index + %c256 = arith.constant 256 : index %alloc = 
memref.alloc() : memref<512x256x3x3xf32> - gpu.launch_func @unified::@Unknown95 blocks in (%c9216, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512x256x3x3xf16>, %alloc : memref<512x256x3x3xf32>) + gpu.launch_func @unified::@Unknown94 blocks in (%c1152, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<512x256x3x3xf16>, %alloc : memref<512x256x3x3xf32>) return %alloc : memref<512x256x3x3xf32> } - func.func private @Unknown96(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 18432 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown96", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown95(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 2304 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown95", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c2304 = arith.constant 2304 : index %c1 = arith.constant 1 : index - %c18432 = arith.constant 18432 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<512x512x3x3xf32> - gpu.launch_func @unified::@Unknown96 blocks in (%c18432, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512x512x3x3xf16>, %alloc : memref<512x512x3x3xf32>) + gpu.launch_func @unified::@Unknown95 blocks in (%c2304, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<512x512x3x3xf16>, %alloc : memref<512x512x3x3xf32>) return %alloc : memref<512x512x3x3xf32> } - func.func private @Unknown97(%arg0: memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1024 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown97", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + func.func private @Unknown96(%arg0: memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 128 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown96", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { %c128 = arith.constant 128 : index %c1 = arith.constant 1 : index - %c1024 = arith.constant 1024 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<512x256x1x1xf32> - gpu.launch_func @unified::@Unknown97 blocks in (%c1024, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512x256x1x1xf16>, %alloc : memref<512x256x1x1xf32>) + gpu.launch_func @unified::@Unknown96 blocks in (%c128, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<512x256x1x1xf16>, %alloc : memref<512x256x1x1xf32>) return %alloc : memref<512x256x1x1xf32> } - func.func private @Unknown98(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 18432 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown98", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c18432 = arith.constant 18432 : index - %alloc = 
memref.alloc() : memref<512x512x3x3xf32> - gpu.launch_func @unified::@Unknown98 blocks in (%c18432, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512x512x3x3xf16>, %alloc : memref<512x512x3x3xf32>) - return %alloc : memref<512x512x3x3xf32> - } - func.func private @Unknown99(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 18432 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown99", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c18432 = arith.constant 18432 : index - %alloc = memref.alloc() : memref<512x512x3x3xf32> - gpu.launch_func @unified::@Unknown99 blocks in (%c18432, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512x512x3x3xf16>, %alloc : memref<512x512x3x3xf32>) - return %alloc : memref<512x512x3x3xf32> - } func.func @main(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>, %arg3: memref<64xf32>, %arg4: memref<64xf32>, %arg5: memref<64xf32>, %arg6: memref<64xf32>, %arg7: memref<64xf32>, %arg8: memref<64xf32>, %arg9: memref<64xf32>, %arg10: memref<128xf32>, %arg11: memref<128xf32>, %arg12: memref<128xf32>, %arg13: memref<128xf32>, %arg14: memref<128xf32>, %arg15: memref<128xf32>, %arg16: memref<128xf32>, %arg17: memref<128xf32>, %arg18: memref<128xf32>, %arg19: memref<128xf32>, %arg20: memref<256xf32>, %arg21: memref<256xf32>, %arg22: memref<256xf32>, %arg23: memref<256xf32>, %arg24: memref<256xf32>, %arg25: memref<256xf32>, %arg26: memref<256xf32>, %arg27: memref<256xf32>, %arg28: memref<256xf32>, %arg29: memref<256xf32>, %arg30: memref<512xf32>, %arg31: memref<512xf32>, %arg32: memref<512xf32>, %arg33: memref<512xf32>, %arg34: memref<512xf32>, %arg35: memref<512xf32>, %arg36: memref<512xf32>, %arg37: memref<512xf32>, %arg38: memref<512xf32>, %arg39: memref<512xf32>, %arg40: memref<64xf32>, %arg41: memref<64xf32>, %arg42: memref<64xf32>, %arg43: memref<64xf32>, %arg44: memref<64xf32>, %arg45: memref<64xf32>, %arg46: memref<64xf32>, %arg47: memref<64xf32>, %arg48: memref<64xf32>, %arg49: memref<64xf32>, %arg50: memref<128xf32>, %arg51: memref<128xf32>, %arg52: memref<128xf32>, %arg53: memref<128xf32>, %arg54: memref<128xf32>, %arg55: memref<128xf32>, %arg56: memref<128xf32>, %arg57: memref<128xf32>, %arg58: memref<128xf32>, %arg59: memref<128xf32>, %arg60: memref<256xf32>, %arg61: memref<256xf32>, %arg62: memref<256xf32>, %arg63: memref<256xf32>, %arg64: memref<256xf32>, %arg65: memref<256xf32>, %arg66: memref<256xf32>, %arg67: memref<256xf32>, %arg68: memref<256xf32>, %arg69: memref<256xf32>, %arg70: memref<512xf32>, %arg71: memref<512xf32>, %arg72: memref<512xf32>, %arg73: memref<512xf32>, %arg74: memref<512xf32>, %arg75: memref<512xf32>, %arg76: memref<512xf32>, %arg77: memref<512xf32>, %arg78: memref<512xf32>, %arg79: memref<512xf32>, %arg80: memref<64x3x7x7xf16>, %arg81: memref<1x3x224x224xf16>, %arg82: memref<1x64x112x112xf16>, %arg83: memref<1x64x112x112xf16>, %arg84: memref<1x64x56x56xf16>, %arg85: memref<64x64x3x3xf16>, %arg86: memref<1x64x56x56xf16>, %arg87: memref<1x64x56x56xf16>, %arg88: memref<64x64x3x3xf16>, %arg89: memref<1x64x56x56xf16>, %arg90: memref<1x64x56x56xf16>, %arg91: memref<64x64x3x3xf16>, %arg92: memref<1x64x56x56xf16>, %arg93: memref<1x64x56x56xf16>, %arg94: memref<64x64x3x3xf16>, %arg95: memref<1x64x56x56xf16>, %arg96: memref<1x64x56x56xf16>, %arg97: 
memref<128x64x3x3xf16>, %arg98: memref<1x128x28x28xf16>, %arg99: memref<1x128x28x28xf16>, %arg100: memref<128x128x3x3xf16>, %arg101: memref<1x128x28x28xf16>, %arg102: memref<128x64x1x1xf16>, %arg103: memref<1x128x28x28xf16>, %arg104: memref<1x128x28x28xf16>, %arg105: memref<128x128x3x3xf16>, %arg106: memref<1x128x28x28xf16>, %arg107: memref<1x128x28x28xf16>, %arg108: memref<128x128x3x3xf16>, %arg109: memref<1x128x28x28xf16>, %arg110: memref<1x128x28x28xf16>, %arg111: memref<256x128x3x3xf16>, %arg112: memref<1x256x14x14xf16>, %arg113: memref<1x256x14x14xf16>, %arg114: memref<256x256x3x3xf16>, %arg115: memref<1x256x14x14xf16>, %arg116: memref<256x128x1x1xf16>, %arg117: memref<1x256x14x14xf16>, %arg118: memref<1x256x14x14xf16>, %arg119: memref<256x256x3x3xf16>, %arg120: memref<1x256x14x14xf16>, %arg121: memref<1x256x14x14xf16>, %arg122: memref<256x256x3x3xf16>, %arg123: memref<1x256x14x14xf16>, %arg124: memref<1x256x14x14xf16>, %arg125: memref<512x256x3x3xf16>, %arg126: memref<1x512x7x7xf16>, %arg127: memref<1x512x7x7xf16>, %arg128: memref<512x512x3x3xf16>, %arg129: memref<1x512x7x7xf16>, %arg130: memref<512x256x1x1xf16>, %arg131: memref<1x512x7x7xf16>, %arg132: memref<1x512x7x7xf16>, %arg133: memref<512x512x3x3xf16>, %arg134: memref<1x512x7x7xf16>, %arg135: memref<1x512x7x7xf16>, %arg136: memref<512x512x3x3xf16>, %arg137: memref<1x512x7x7xf16>, %arg138: memref<1x512x7x7xf16>, %arg139: memref<1x512xf16>, %arg140: memref<512x1000xf16>, %arg141: memref<1x1000xf16>) -> (memref<64xf32>, memref<64xf32>, memref<64x3x7x7xf32>, memref<1000xf32>, memref<1000x512xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64x64x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128x64x3x3xf32>, memref<128x128x3x3xf32>, memref<128x64x1x1xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128x128x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256x128x3x3xf32>, memref<256x256x3x3xf32>, memref<256x128x1x1xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256x256x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512x256x3x3xf32>, memref<512x512x3x3xf32>, memref<512x256x1x1xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512x512x3x3xf32>) attributes {__placeholder__byre.entry_point} { %alloc = memref.alloc() : memref<1x512xf16> byre.compute @MatmulOp_f16f16_f16(%arg141, %arg140, %alloc) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<1x1000xf16>, memref<512x1000xf16>, memref<1x512xf16> @@ -2092,7 +800,7 @@ module attributes {gpu.container_module} { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_10, %arg128, %alloc_13) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16> %alloc_14 = memref.alloc() : memref<512x512x3x3xf16> byre.compute 
@ConvBackwardFilterOp_f16f16_f16(%arg127, %alloc_10, %alloc_14) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16> - %3 = call @Unknown12(%arg127, %alloc_13) : (memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> + %3 = call @Unknown4(%arg127, %alloc_13) : (memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> %alloc_15 = memref.alloc() : memref<1x512x7x7xf16> %alloc_16 = memref.alloc() : memref<512xf32> %alloc_17 = memref.alloc() : memref<512xf32> @@ -2127,7 +835,7 @@ module attributes {gpu.container_module} { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_30, %arg119, %alloc_33) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16> %alloc_34 = memref.alloc() : memref<256x256x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg118, %alloc_30, %alloc_34) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16> - %6 = call @Unknown27(%4, %alloc_33, %arg118) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> + %6 = call @Unknown19(%4, %alloc_33, %arg118) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> %alloc_35 = memref.alloc() : memref<1x256x14x14xf16> %alloc_36 = memref.alloc() : memref<256xf32> %alloc_37 = memref.alloc() : memref<256xf32> @@ -2136,7 +844,7 @@ module attributes {gpu.container_module} { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_35, %arg114, %alloc_38) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16> %alloc_39 = memref.alloc() : memref<256x256x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg113, %alloc_35, %alloc_39) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16> - %7 = call @Unknown31(%arg113, %alloc_38) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> + %7 = call @Unknown23(%arg113, %alloc_38) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> %alloc_40 = memref.alloc() : memref<1x256x14x14xf16> %alloc_41 = memref.alloc() : memref<256xf32> %alloc_42 = memref.alloc() : memref<256xf32> @@ -2171,7 +879,7 @@ module attributes {gpu.container_module} { byre.compute 
@ConvBackwardDataOp_f16f16_f16(%alloc_55, %arg105, %alloc_58) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16> %alloc_59 = memref.alloc() : memref<128x128x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg104, %alloc_55, %alloc_59) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16> - %10 = call @Unknown46(%8, %alloc_58, %arg104) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> + %10 = call @Unknown38(%8, %alloc_58, %arg104) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> %alloc_60 = memref.alloc() : memref<1x128x28x28xf16> %alloc_61 = memref.alloc() : memref<128xf32> %alloc_62 = memref.alloc() : memref<128xf32> @@ -2180,7 +888,7 @@ module attributes {gpu.container_module} { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_60, %arg100, %alloc_63) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16> %alloc_64 = memref.alloc() : memref<128x128x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg99, %alloc_60, %alloc_64) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16> - %11 = call @Unknown50(%arg99, %alloc_63) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> + %11 = call @Unknown42(%arg99, %alloc_63) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> %alloc_65 = memref.alloc() : memref<1x128x28x28xf16> %alloc_66 = memref.alloc() : memref<128xf32> %alloc_67 = memref.alloc() : memref<128xf32> @@ -2215,7 +923,7 @@ module attributes {gpu.container_module} { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_80, %arg91, %alloc_83) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16> %alloc_84 = memref.alloc() : memref<64x64x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg90, %alloc_80, %alloc_84) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16> - 
%14 = call @Unknown65(%12, %alloc_83, %arg90) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> + %14 = call @Unknown57(%12, %alloc_83, %arg90) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> %alloc_85 = memref.alloc() : memref<1x64x56x56xf16> %alloc_86 = memref.alloc() : memref<64xf32> %alloc_87 = memref.alloc() : memref<64xf32> @@ -2224,7 +932,7 @@ module attributes {gpu.container_module} { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_85, %arg88, %alloc_88) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16> %alloc_89 = memref.alloc() : memref<64x64x3x3xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg87, %alloc_85, %alloc_89) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16> - %15 = call @Unknown69(%arg87, %alloc_88) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> + %15 = call @Unknown61(%arg87, %alloc_88) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> %alloc_90 = memref.alloc() : memref<1x64x56x56xf16> %alloc_91 = memref.alloc() : memref<64xf32> %alloc_92 = memref.alloc() : memref<64xf32> @@ -2244,34 +952,31 @@ module attributes {gpu.container_module} { %alloc_99 = memref.alloc() : memref<64x3x7x7xf16> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg81, %alloc_96, %alloc_99) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x3x224x224xf16>, memref<1x64x112x112xf16>, memref<64x3x7x7xf16> %18 = call @Unknown77(%alloc_99) : (memref<64x3x7x7xf16>) -> memref<64x3x7x7xf32> - %19 = call @Unknown78(%arg141) : (memref<1x1000xf16>) -> memref<1x1000xf32> - %alloc_100 = memref.alloc() : memref<1000xf32> - byre.compute @ReduceSumOp_f32_f32(%19, %alloc_100) {dimensions = dense<0> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<1x1000xf32>, memref<1000xf32> - %20 = call @Unknown79(%alloc_100) : (memref<1000xf32>) -> memref<1000xf32> + %19 = call @Unknown78(%arg141) : (memref<1x1000xf16>) -> memref<1000xf32> %collapse_shape = memref.collapse_shape %arg141 [[0, 1]] : memref<1x1000xf16> into memref<1000xf16> %expand_shape = memref.expand_shape %collapse_shape [[0, 1]] : memref<1000xf16> into memref<1000x1xf16> - %alloc_101 = memref.alloc() : memref<1000x512xf16> - byre.compute @MatmulOp_f16f16_f16(%expand_shape, %arg139, %alloc_101) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<1000x1xf16>, memref<1x512xf16>, memref<1000x512xf16> - %21 = call @Unknown80(%alloc_101) : (memref<1000x512xf16>) -> memref<1000x512xf32> - %22 = call @Unknown81(%alloc_94) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %23 = call @Unknown82(%alloc_89) : (memref<64x64x3x3xf16>) -> 
memref<64x64x3x3xf32> - %24 = call @Unknown83(%alloc_84) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %25 = call @Unknown84(%alloc_79) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %26 = call @Unknown85(%alloc_69) : (memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> - %27 = call @Unknown86(%alloc_64) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> - %28 = call @Unknown87(%alloc_74) : (memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> - %29 = call @Unknown88(%alloc_59) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> - %30 = call @Unknown89(%alloc_54) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> - %31 = call @Unknown90(%alloc_44) : (memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> - %32 = call @Unknown91(%alloc_39) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> - %33 = call @Unknown92(%alloc_49) : (memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> - %34 = call @Unknown93(%alloc_34) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> - %35 = call @Unknown94(%alloc_29) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> - %36 = call @Unknown95(%alloc_19) : (memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> - %37 = call @Unknown96(%alloc_14) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> - %38 = call @Unknown97(%alloc_24) : (memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> - %39 = call @Unknown98(%alloc_9) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> - %40 = call @Unknown99(%alloc_4) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> - return %alloc_98, %alloc_97, %18, %20, %21, %alloc_92, %alloc_91, %alloc_87, %alloc_86, %22, %23, %alloc_82, %alloc_81, %alloc_77, %alloc_76, %24, %25, %alloc_67, %alloc_66, %alloc_62, %alloc_61, %26, %27, %28, %alloc_72, %alloc_71, %alloc_57, %alloc_56, %alloc_52, %alloc_51, %29, %30, %alloc_42, %alloc_41, %alloc_37, %alloc_36, %31, %32, %33, %alloc_47, %alloc_46, %alloc_32, %alloc_31, %alloc_27, %alloc_26, %34, %35, %alloc_17, %alloc_16, %alloc_12, %alloc_11, %36, %37, %38, %alloc_22, %alloc_21, %alloc_7, %alloc_6, %alloc_2, %alloc_1, %39, %40 : memref<64xf32>, memref<64xf32>, memref<64x3x7x7xf32>, memref<1000xf32>, memref<1000x512xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64x64x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128x64x3x3xf32>, memref<128x128x3x3xf32>, memref<128x64x1x1xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128x128x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256x128x3x3xf32>, memref<256x256x3x3xf32>, memref<256x128x1x1xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256x256x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512x256x3x3xf32>, memref<512x512x3x3xf32>, memref<512x256x1x1xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512x512x3x3xf32> + %alloc_100 = memref.alloc() : memref<1000x512xf16> + byre.compute @MatmulOp_f16f16_f16(%expand_shape, %arg139, %alloc_100) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : 
memref<1000x1xf16>, memref<1x512xf16>, memref<1000x512xf16> + %20 = call @Unknown79(%alloc_100) : (memref<1000x512xf16>) -> memref<1000x512xf32> + %21 = call @Unknown80(%alloc_94) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %22 = call @Unknown80(%alloc_89) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %23 = call @Unknown80(%alloc_84) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %24 = call @Unknown80(%alloc_79) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %25 = call @Unknown84(%alloc_69) : (memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> + %26 = call @Unknown85(%alloc_64) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> + %27 = call @Unknown86(%alloc_74) : (memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> + %28 = call @Unknown85(%alloc_59) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> + %29 = call @Unknown85(%alloc_54) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> + %30 = call @Unknown89(%alloc_44) : (memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> + %31 = call @Unknown90(%alloc_39) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> + %32 = call @Unknown91(%alloc_49) : (memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> + %33 = call @Unknown90(%alloc_34) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> + %34 = call @Unknown90(%alloc_29) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> + %35 = call @Unknown94(%alloc_19) : (memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> + %36 = call @Unknown95(%alloc_14) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> + %37 = call @Unknown96(%alloc_24) : (memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> + %38 = call @Unknown95(%alloc_9) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> + %39 = call @Unknown95(%alloc_4) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> + return %alloc_98, %alloc_97, %18, %19, %20, %alloc_92, %alloc_91, %alloc_87, %alloc_86, %21, %22, %alloc_82, %alloc_81, %alloc_77, %alloc_76, %23, %24, %alloc_67, %alloc_66, %alloc_62, %alloc_61, %25, %26, %27, %alloc_72, %alloc_71, %alloc_57, %alloc_56, %alloc_52, %alloc_51, %28, %29, %alloc_42, %alloc_41, %alloc_37, %alloc_36, %30, %31, %32, %alloc_47, %alloc_46, %alloc_32, %alloc_31, %alloc_27, %alloc_26, %33, %34, %alloc_17, %alloc_16, %alloc_12, %alloc_11, %35, %36, %37, %alloc_22, %alloc_21, %alloc_7, %alloc_6, %alloc_2, %alloc_1, %38, %39 : memref<64xf32>, memref<64xf32>, memref<64x3x7x7xf32>, memref<1000xf32>, memref<1000x512xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64x64x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128x64x3x3xf32>, memref<128x128x3x3xf32>, memref<128x64x1x1xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128x128x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256x128x3x3xf32>, memref<256x256x3x3xf32>, memref<256x128x1x1xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256x256x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512x256x3x3xf32>, memref<512x512x3x3xf32>, memref<512x256x1x1xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, 
memref<512x512x3x3xf32>, memref<512x512x3x3xf32> } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/BW/8_byre_opt.mlir b/compiler/test/E2E/ResNet18/BW/8_byre_opt.mlir index 67dcb06aa..e22e39fa7 100644 --- a/compiler/test/E2E/ResNet18/BW/8_byre_opt.mlir +++ b/compiler/test/E2E/ResNet18/BW/8_byre_opt.mlir @@ -4,915 +4,250 @@ module attributes {gpu.container_module} { gpu.module @unified { - gpu.func @Unknown99(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf32> - } - gpu.return - } - gpu.func @Unknown98(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = 
arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf32> - } - gpu.return - } - gpu.func @Unknown97(%arg0: memref<512x256x1x1xf16>, %arg1: memref<512x256x1x1xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown96(%arg0: memref<512x256x1x1xf16>, %arg1: memref<512x256x1x1xf32>) kernel { %c131072 = arith.constant 131072 : index + %c0 = arith.constant 0 : index %c256 = arith.constant 256 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c131072 : index - scf.if %5 { - %6 = arith.remsi %4, %c256 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c256 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c256 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<512x256x1x1xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<512x256x1x1xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c131072 step %6 { + %7 = arith.remsi %arg2, %c256 : index + %8 = arith.divsi %arg2, %c256 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<512x256x1x1xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<512x256x1x1xf32> } gpu.return } - gpu.func @Unknown96(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown95(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi 
slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf32> - } - gpu.return - } - gpu.func @Unknown95(%arg0: memref<512x256x3x3xf16>, %arg1: memref<512x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c1179648 = arith.constant 1179648 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1179648 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x256x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c2359296 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x512x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x512x3x3xf32> } gpu.return } - gpu.func @Unknown94(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown94(%arg0: memref<512x256x3x3xf16>, %arg1: memref<512x256x3x3xf32>) kernel { + %c1179648 = arith.constant 1179648 : index %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = 
arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf32> - } - gpu.return - } - gpu.func @Unknown93(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1179648 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x256x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x256x3x3xf32> } gpu.return } - gpu.func @Unknown92(%arg0: 
memref<256x128x1x1xf16>, %arg1: memref<256x128x1x1xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown91(%arg0: memref<256x128x1x1xf16>, %arg1: memref<256x128x1x1xf32>) kernel { %c32768 = arith.constant 32768 : index + %c0 = arith.constant 0 : index %c128 = arith.constant 128 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c32768 : index - scf.if %5 { - %6 = arith.remsi %4, %c128 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c128 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c128 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<256x128x1x1xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<256x128x1x1xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c32768 step %6 { + %7 = arith.remsi %arg2, %c128 : index + %8 = arith.divsi %arg2, %c128 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<256x128x1x1xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<256x128x1x1xf32> } gpu.return } - gpu.func @Unknown91(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown90(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf32> - } - gpu.return - } - gpu.func @Unknown90(%arg0: memref<256x128x3x3xf16>, %arg1: memref<256x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c294912 = arith.constant 294912 : 
index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c294912 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x128x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c589824 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x256x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x256x3x3xf32> } gpu.return } - gpu.func @Unknown89(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown89(%arg0: memref<256x128x3x3xf16>, %arg1: memref<256x128x3x3xf32>) kernel { + %c294912 = arith.constant 294912 : index %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = 
arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf32> - } - gpu.return - } - gpu.func @Unknown88(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c294912 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x128x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x128x3x3xf32> } gpu.return } - gpu.func @Unknown87(%arg0: memref<128x64x1x1xf16>, %arg1: memref<128x64x1x1xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown86(%arg0: memref<128x64x1x1xf16>, %arg1: memref<128x64x1x1xf32>) kernel { %c8192 = arith.constant 8192 : index + %c0 = arith.constant 0 : index %c64 = arith.constant 64 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c8192 : index - scf.if %5 { - %6 = arith.remsi %4, %c64 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, 
%c64 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c64 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<128x64x1x1xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<128x64x1x1xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c8192 step %6 { + %7 = arith.remsi %arg2, %c64 : index + %8 = arith.divsi %arg2, %c64 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<128x64x1x1xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<128x64x1x1xf32> } gpu.return } - gpu.func @Unknown86(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown85(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c147456 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x128x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x128x3x3xf32> } gpu.return } - gpu.func @Unknown85(%arg0: memref<128x64x3x3xf16>, %arg1: memref<128x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown84(%arg0: memref<128x64x3x3xf16>, %arg1: memref<128x64x3x3xf32>) kernel { %c73728 = arith.constant 73728 : index - %c3 = 
arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c73728 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x64x3x3xf32> - } - gpu.return - } - gpu.func @Unknown84(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c73728 step %6 { + %7 
= arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x64x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x64x3x3xf32> } gpu.return } - gpu.func @Unknown83(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown80(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> - } - gpu.return - } - gpu.func @Unknown82(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = 
arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c36864 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x64x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x64x3x3xf32> } gpu.return } - gpu.func @Unknown81(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> - } - gpu.return - } - gpu.func @Unknown80(%arg0: memref<1000x512xf16>, %arg1: memref<1000x512xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown79(%arg0: memref<1000x512xf16>, %arg1: memref<1000x512xf32>) kernel { %c512000 = arith.constant 512000 : index %c512 = arith.constant 512 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512000 : index - scf.if %5 { - %6 = arith.remsi %4, %c512 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c512 : index - %9 = arith.select %7, %8, 
%6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c512 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9] : memref<1000x512xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9] : memref<1000x512xf32> - } - gpu.return - } - gpu.func @Unknown79(%arg0: memref<1000xf32>, %arg1: memref<1000xf32>) kernel { - %c1000 = arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1000 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<1000xf32> - %7 = arith.truncf %6 : f32 to f16 - %8 = arith.extf %7 : f16 to f32 - memref.store %8, %arg1[%4] : memref<1000xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c512000 step %6 { + %7 = arith.remsi %arg2, %c512 : index + %8 = arith.divsi %arg2, %c512 : index + %9 = memref.load %arg0[%8, %7] : memref<1000x512xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7] : memref<1000x512xf32> } gpu.return } @@ -924,857 +259,349 @@ module attributes {gpu.container_module} { %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1000 : index - scf.if %5 { - %6 = memref.load %arg0[%c0, %4] : memref<1x1000xf16> - %7 = arith.extf %6 : f16 to f32 - memref.store %7, %arg1[%c0, %4] : memref<1x1000xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1000 step %6 { + %7 = memref.load %arg0[%c0, %arg2] : memref<1x1000xf16> + %8 = arith.extf %7 : f16 to f32 + %9 = arith.truncf %8 : f32 to f16 + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%c0, %arg2] : memref<1x1000xf32> } gpu.return } gpu.func @Unknown77(%arg0: memref<64x3x7x7xf16>, %arg1: memref<64x3x7x7xf32>) kernel { - %c0 = arith.constant 0 : index %c9408 = arith.constant 9408 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %c3 = arith.constant 3 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c9408 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c3 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c3 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c3 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - 
%36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x3x7x7xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x3x7x7xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c9408 step %6 { + %7 = arith.remsi %arg2, %c7 : index + %8 = arith.divsi %arg2, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c3 : index + %12 = arith.divsi %10, %c3 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x3x7x7xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x3x7x7xf32> } gpu.return } gpu.func @Unknown74(%arg0: memref<1x64x112x112xf16>, %arg1: memref<1x64x112x112xf16>, %arg2: memref<1x64x112x112xf16>) kernel { + %c802816 = arith.constant 802816 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index %c112 = arith.constant 112 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c112 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c112 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c112 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c112 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c112 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c112 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x112x112xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x112x112xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x64x112x112xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg3, %c112 : index + %8 = arith.divsi %arg3, %c112 : index + %9 = arith.remsi %8, %c112 : index + %10 = arith.divsi %8, %c112 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x112x112xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x64x112x112xf16> + %13 = arith.cmpf ogt, %11, %cst : f16 + %14 = arith.select %13, %12, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x64x112x112xf16> } gpu.return } gpu.func @Unknown73(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) kernel { - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = 
arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = arith.addf %26, %27 : f16 - memref.store %28, %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - } - gpu.return - } - gpu.func @Unknown69(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index %c200704 = arith.constant 200704 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - } - gpu.return - } - gpu.func @Unknown65(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>, %arg3: memref<1x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : 
index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg3, %c56 : index + %8 = arith.divsi %arg3, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %13 = arith.addf %11, %12 : f16 + memref.store %13, %arg2[%c0, %10, %9, %7] : memref<1x64x56x56xf16> } gpu.return } gpu.func @Unknown61(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) kernel { + %c200704 = arith.constant 200704 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg3, %c56 : index + %8 = arith.divsi %arg3, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %13 = arith.cmpf ogt, %11, %cst : f16 + %14 = arith.select %13, %12, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x64x56x56xf16> } gpu.return } gpu.func @Unknown57(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>, %arg3: memref<1x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index %c200704 = arith.constant 200704 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant 
-1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - } - gpu.return - } - gpu.func @Unknown50(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) kernel { %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - } - gpu.return - } - gpu.func @Unknown46(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>, %arg3: memref<1x128x28x28xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index + %c56 = arith.constant 56 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : 
index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg4, %c56 : index + %8 = arith.divsi %arg4, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %13 = memref.load %arg2[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %14 = arith.addf %11, %12 : f16 + %15 = arith.cmpf ogt, %13, %cst : f16 + %16 = arith.select %15, %14, %cst : f16 + memref.store %16, %arg3[%c0, %10, %9, %7] : memref<1x64x56x56xf16> } gpu.return } gpu.func @Unknown42(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) kernel { + %c100352 = arith.constant 100352 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to 
%c100352 step %6 { + %7 = arith.remsi %arg3, %c28 : index + %8 = arith.divsi %arg3, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %13 = arith.cmpf ogt, %11, %cst : f16 + %14 = arith.select %13, %12, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x128x28x28xf16> } gpu.return } gpu.func @Unknown38(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>, %arg3: memref<1x128x28x28xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index %c100352 = arith.constant 100352 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - } - gpu.return - } - gpu.func @Unknown31(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>) kernel { %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : 
memref<1x256x14x14xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - } - gpu.return - } - gpu.func @Unknown27(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>, %arg3: memref<1x256x14x14xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index + %c28 = arith.constant 28 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg4, %c28 : index + %8 = arith.divsi %arg4, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %13 = memref.load %arg2[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %14 = arith.addf %11, %12 : f16 + %15 = arith.cmpf ogt, %13, %cst : f16 + %16 = arith.select %15, %14, %cst : f16 + memref.store %16, %arg3[%c0, %10, %9, %7] : memref<1x128x28x28xf16> } gpu.return } gpu.func @Unknown23(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>) kernel { + %c50176 = arith.constant 50176 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = 
arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c50176 step %6 { + %7 = arith.remsi %arg3, %c14 : index + %8 = arith.divsi %arg3, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %13 = arith.cmpf ogt, %11, %cst : f16 + %14 = arith.select %13, %12, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x256x14x14xf16> } gpu.return } gpu.func @Unknown19(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>, %arg3: memref<1x256x14x14xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index %c50176 = arith.constant 50176 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - } - gpu.return - } - gpu.func @Unknown12(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) kernel { %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index + %c14 = arith.constant 14 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = 
arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c50176 step %6 { + %7 = arith.remsi %arg4, %c14 : index + %8 = arith.divsi %arg4, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %13 = memref.load %arg2[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %14 = arith.addf %11, %12 : f16 + %15 = arith.cmpf ogt, %13, %cst : f16 + %16 = arith.select %15, %14, %cst : f16 + memref.store %16, %arg3[%c0, %10, %9, %7] : memref<1x256x14x14xf16> } gpu.return } gpu.func @Unknown8(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>, %arg3: memref<1x512x7x7xf16>) kernel { + %c25088 = arith.constant 25088 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c25088 step %6 { + %7 = arith.remsi 
%arg4, %c7 : index + %8 = arith.divsi %arg4, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %13 = memref.load %arg2[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %14 = arith.addf %11, %12 : f16 + %15 = arith.cmpf ogt, %13, %cst : f16 + %16 = arith.select %15, %14, %cst : f16 + memref.store %16, %arg3[%c0, %10, %9, %7] : memref<1x512x7x7xf16> } gpu.return } gpu.func @Unknown4(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) kernel { + %c25088 = arith.constant 25088 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c25088 step %6 { + %7 = arith.remsi %arg3, %c7 : index + %8 = arith.divsi %arg3, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %13 = arith.cmpf ogt, %11, %cst : f16 + %14 = arith.select %13, %12, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x512x7x7xf16> } gpu.return } gpu.func @Unknown0(%arg0: memref<1x512xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %cst_0 = arith.constant 4.900000e+01 : f16 - %c0 = arith.constant 0 : index %c25088 = arith.constant 25088 : index + %cst = arith.constant 4.900000e+01 : f16 + %cst_0 = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, 
%11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = memref.load %arg0[%c0, %25] : memref<1x512xf16> - %28 = arith.divf %27, %cst_0 : f16 - %29 = arith.cmpf ogt, %26, %cst : f16 - %30 = arith.select %29, %28, %cst : f16 - memref.store %30, %arg2[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c25088 step %6 { + %7 = arith.remsi %arg3, %c7 : index + %8 = arith.divsi %arg3, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = memref.load %arg0[%c0, %10] : memref<1x512xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %13 = arith.divf %11, %cst : f16 + %14 = arith.cmpf ogt, %12, %cst_0 : f16 + %15 = arith.select %14, %13, %cst_0 : f16 + memref.store %15, %arg2[%c0, %10, %9, %7] : memref<1x512x7x7xf16> } gpu.return } } - func.func private @Unknown0(memref<1x512xf16, "cuda">, memref<1x512x7x7xf16, "cuda">) -> memref<1x512x7x7xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [2 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown0", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown4(memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">) -> memref<1x512x7x7xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown4", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown8(memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">) -> memref<1x512x7x7xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown8", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown12(memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">) -> memref<1x512x7x7xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown12", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown19(memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">) -> memref<1x256x14x14xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 392 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown19", __byteir_elementwise_fusion__, 
arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown23(memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">) -> memref<1x256x14x14xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 392 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown23", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown27(memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">) -> memref<1x256x14x14xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 392 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown27", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown31(memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">) -> memref<1x256x14x14xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 392 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown31", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown38(memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">) -> memref<1x128x28x28xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown38", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown42(memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">) -> memref<1x128x28x28xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown42", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown46(memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">) -> memref<1x128x28x28xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown46", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown50(memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">) -> memref<1x128x28x28xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown50", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown57(memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">) -> memref<1x64x56x56xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, 
__byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown57", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown61(memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">) -> memref<1x64x56x56xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown61", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown65(memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">) -> memref<1x64x56x56xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown65", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown69(memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">) -> memref<1x64x56x56xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown69", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown73(memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">) -> memref<1x64x56x56xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown73", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown74(memref<1x64x112x112xf16, "cuda">, memref<1x64x112x112xf16, "cuda">) -> memref<1x64x112x112xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 6272 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown74", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown77(memref<64x3x7x7xf16, "cuda">) -> memref<64x3x7x7xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 74 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown77", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown78(memref<1x1000xf16, "cuda">) -> memref<1x1000xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 8 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown78", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown79(memref<1000xf32, "cuda">) -> memref<1000xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 8 : i32, __byre__arg_ranks = [1 : i32, 1 : i32], __byre__kernel_name = "Unknown79", __byteir_elementwise_fusion__, arg_offsets = [0 
: i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown80(memref<1000x512xf16, "cuda">) -> memref<1000x512xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4000 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown80", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown81(memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown81", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown82(memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown82", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown83(memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown83", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown84(memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown84", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown85(memref<128x64x3x3xf16, "cuda">) -> memref<128x64x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 576 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown85", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown86(memref<128x128x3x3xf16, "cuda">) -> memref<128x128x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown86", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown87(memref<128x64x1x1xf16, "cuda">) -> memref<128x64x1x1xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 64 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown87", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown88(memref<128x128x3x3xf16, "cuda">) -> memref<128x128x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown88", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - 
func.func private @Unknown89(memref<128x128x3x3xf16, "cuda">) -> memref<128x128x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown89", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown90(memref<256x128x3x3xf16, "cuda">) -> memref<256x128x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2304 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown90", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown91(memref<256x256x3x3xf16, "cuda">) -> memref<256x256x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4608 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown91", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown92(memref<256x128x1x1xf16, "cuda">) -> memref<256x128x1x1xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 256 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown92", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown93(memref<256x256x3x3xf16, "cuda">) -> memref<256x256x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4608 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown93", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown94(memref<256x256x3x3xf16, "cuda">) -> memref<256x256x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4608 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown94", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown95(memref<512x256x3x3xf16, "cuda">) -> memref<512x256x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 9216 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown95", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown96(memref<512x512x3x3xf16, "cuda">) -> memref<512x512x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 18432 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown96", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown97(memref<512x256x1x1xf16, "cuda">) -> memref<512x256x1x1xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1024 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown97", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown98(memref<512x512x3x3xf16, "cuda">) 
-> memref<512x512x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 18432 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown98", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown99(memref<512x512x3x3xf16, "cuda">) -> memref<512x512x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 18432 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown99", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown0(memref<1x512xf16, "cuda">, memref<1x512x7x7xf16, "cuda">) -> memref<1x512x7x7xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 25 : i32, __byre__arg_ranks = [2 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown0", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown4(memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">) -> memref<1x512x7x7xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 25 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown4", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown8(memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">) -> memref<1x512x7x7xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 25 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown8", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown19(memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">) -> memref<1x256x14x14xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 49 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown19", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown23(memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">) -> memref<1x256x14x14xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 49 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown23", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown38(memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">) -> memref<1x128x28x28xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 98 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown38", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown42(memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, 
"cuda">) -> memref<1x128x28x28xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 98 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown42", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown57(memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">) -> memref<1x64x56x56xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown57", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown61(memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">) -> memref<1x64x56x56xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown61", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown73(memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">) -> memref<1x64x56x56xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown73", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown74(memref<1x64x112x112xf16, "cuda">, memref<1x64x112x112xf16, "cuda">) -> memref<1x64x112x112xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown74", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown77(memref<64x3x7x7xf16, "cuda">) -> memref<64x3x7x7xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 10 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown77", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown78(memref<1x1000xf16, "cuda">) -> memref<1000xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown78", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown79(memref<1000x512xf16, "cuda">) -> memref<1000x512xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 500 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown79", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown80(memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 36 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown80", 
__byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown84(memref<128x64x3x3xf16, "cuda">) -> memref<128x64x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 72 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown84", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown85(memref<128x128x3x3xf16, "cuda">) -> memref<128x128x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 144 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown85", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown86(memref<128x64x1x1xf16, "cuda">) -> memref<128x64x1x1xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 8 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown86", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown89(memref<256x128x3x3xf16, "cuda">) -> memref<256x128x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown89", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown90(memref<256x256x3x3xf16, "cuda">) -> memref<256x256x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 576 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown90", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown91(memref<256x128x1x1xf16, "cuda">) -> memref<256x128x1x1xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 32 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown91", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown94(memref<512x256x3x3xf16, "cuda">) -> memref<512x256x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown94", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown95(memref<512x512x3x3xf16, "cuda">) -> memref<512x512x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 2304 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown95", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown96(memref<512x256x1x1xf16, "cuda">) -> memref<512x256x1x1xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 128 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown96", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], 
byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} func.func @main(%arg0: memref<64xf32, "cuda">, %arg1: memref<64xf32, "cuda">, %arg2: memref<64xf32, "cuda">, %arg3: memref<64xf32, "cuda">, %arg4: memref<64xf32, "cuda">, %arg5: memref<64xf32, "cuda">, %arg6: memref<64xf32, "cuda">, %arg7: memref<64xf32, "cuda">, %arg8: memref<64xf32, "cuda">, %arg9: memref<64xf32, "cuda">, %arg10: memref<128xf32, "cuda">, %arg11: memref<128xf32, "cuda">, %arg12: memref<128xf32, "cuda">, %arg13: memref<128xf32, "cuda">, %arg14: memref<128xf32, "cuda">, %arg15: memref<128xf32, "cuda">, %arg16: memref<128xf32, "cuda">, %arg17: memref<128xf32, "cuda">, %arg18: memref<128xf32, "cuda">, %arg19: memref<128xf32, "cuda">, %arg20: memref<256xf32, "cuda">, %arg21: memref<256xf32, "cuda">, %arg22: memref<256xf32, "cuda">, %arg23: memref<256xf32, "cuda">, %arg24: memref<256xf32, "cuda">, %arg25: memref<256xf32, "cuda">, %arg26: memref<256xf32, "cuda">, %arg27: memref<256xf32, "cuda">, %arg28: memref<256xf32, "cuda">, %arg29: memref<256xf32, "cuda">, %arg30: memref<512xf32, "cuda">, %arg31: memref<512xf32, "cuda">, %arg32: memref<512xf32, "cuda">, %arg33: memref<512xf32, "cuda">, %arg34: memref<512xf32, "cuda">, %arg35: memref<512xf32, "cuda">, %arg36: memref<512xf32, "cuda">, %arg37: memref<512xf32, "cuda">, %arg38: memref<512xf32, "cuda">, %arg39: memref<512xf32, "cuda">, %arg40: memref<64xf32, "cuda">, %arg41: memref<64xf32, "cuda">, %arg42: memref<64xf32, "cuda">, %arg43: memref<64xf32, "cuda">, %arg44: memref<64xf32, "cuda">, %arg45: memref<64xf32, "cuda">, %arg46: memref<64xf32, "cuda">, %arg47: memref<64xf32, "cuda">, %arg48: memref<64xf32, "cuda">, %arg49: memref<64xf32, "cuda">, %arg50: memref<128xf32, "cuda">, %arg51: memref<128xf32, "cuda">, %arg52: memref<128xf32, "cuda">, %arg53: memref<128xf32, "cuda">, %arg54: memref<128xf32, "cuda">, %arg55: memref<128xf32, "cuda">, %arg56: memref<128xf32, "cuda">, %arg57: memref<128xf32, "cuda">, %arg58: memref<128xf32, "cuda">, %arg59: memref<128xf32, "cuda">, %arg60: memref<256xf32, "cuda">, %arg61: memref<256xf32, "cuda">, %arg62: memref<256xf32, "cuda">, %arg63: memref<256xf32, "cuda">, %arg64: memref<256xf32, "cuda">, %arg65: memref<256xf32, "cuda">, %arg66: memref<256xf32, "cuda">, %arg67: memref<256xf32, "cuda">, %arg68: memref<256xf32, "cuda">, %arg69: memref<256xf32, "cuda">, %arg70: memref<512xf32, "cuda">, %arg71: memref<512xf32, "cuda">, %arg72: memref<512xf32, "cuda">, %arg73: memref<512xf32, "cuda">, %arg74: memref<512xf32, "cuda">, %arg75: memref<512xf32, "cuda">, %arg76: memref<512xf32, "cuda">, %arg77: memref<512xf32, "cuda">, %arg78: memref<512xf32, "cuda">, %arg79: memref<512xf32, "cuda">, %arg80: memref<64x3x7x7xf16, "cuda">, %arg81: memref<1x3x224x224xf16, "cuda">, %arg82: memref<1x64x112x112xf16, "cuda">, %arg83: memref<1x64x112x112xf16, "cuda">, %arg84: memref<1x64x56x56xf16, "cuda">, %arg85: memref<64x64x3x3xf16, "cuda">, %arg86: memref<1x64x56x56xf16, "cuda">, %arg87: memref<1x64x56x56xf16, "cuda">, %arg88: memref<64x64x3x3xf16, "cuda">, %arg89: memref<1x64x56x56xf16, "cuda">, %arg90: memref<1x64x56x56xf16, "cuda">, %arg91: memref<64x64x3x3xf16, "cuda">, %arg92: memref<1x64x56x56xf16, "cuda">, %arg93: memref<1x64x56x56xf16, "cuda">, %arg94: memref<64x64x3x3xf16, "cuda">, %arg95: memref<1x64x56x56xf16, "cuda">, %arg96: memref<1x64x56x56xf16, "cuda">, %arg97: memref<128x64x3x3xf16, "cuda">, %arg98: memref<1x128x28x28xf16, "cuda">, %arg99: memref<1x128x28x28xf16, "cuda">, %arg100: memref<128x128x3x3xf16, "cuda">, %arg101: 
memref<1x128x28x28xf16, "cuda">, %arg102: memref<128x64x1x1xf16, "cuda">, %arg103: memref<1x128x28x28xf16, "cuda">, %arg104: memref<1x128x28x28xf16, "cuda">, %arg105: memref<128x128x3x3xf16, "cuda">, %arg106: memref<1x128x28x28xf16, "cuda">, %arg107: memref<1x128x28x28xf16, "cuda">, %arg108: memref<128x128x3x3xf16, "cuda">, %arg109: memref<1x128x28x28xf16, "cuda">, %arg110: memref<1x128x28x28xf16, "cuda">, %arg111: memref<256x128x3x3xf16, "cuda">, %arg112: memref<1x256x14x14xf16, "cuda">, %arg113: memref<1x256x14x14xf16, "cuda">, %arg114: memref<256x256x3x3xf16, "cuda">, %arg115: memref<1x256x14x14xf16, "cuda">, %arg116: memref<256x128x1x1xf16, "cuda">, %arg117: memref<1x256x14x14xf16, "cuda">, %arg118: memref<1x256x14x14xf16, "cuda">, %arg119: memref<256x256x3x3xf16, "cuda">, %arg120: memref<1x256x14x14xf16, "cuda">, %arg121: memref<1x256x14x14xf16, "cuda">, %arg122: memref<256x256x3x3xf16, "cuda">, %arg123: memref<1x256x14x14xf16, "cuda">, %arg124: memref<1x256x14x14xf16, "cuda">, %arg125: memref<512x256x3x3xf16, "cuda">, %arg126: memref<1x512x7x7xf16, "cuda">, %arg127: memref<1x512x7x7xf16, "cuda">, %arg128: memref<512x512x3x3xf16, "cuda">, %arg129: memref<1x512x7x7xf16, "cuda">, %arg130: memref<512x256x1x1xf16, "cuda">, %arg131: memref<1x512x7x7xf16, "cuda">, %arg132: memref<1x512x7x7xf16, "cuda">, %arg133: memref<512x512x3x3xf16, "cuda">, %arg134: memref<1x512x7x7xf16, "cuda">, %arg135: memref<1x512x7x7xf16, "cuda">, %arg136: memref<512x512x3x3xf16, "cuda">, %arg137: memref<1x512x7x7xf16, "cuda">, %arg138: memref<1x512x7x7xf16, "cuda">, %arg139: memref<1x512xf16, "cuda">, %arg140: memref<512x1000xf16, "cuda">, %arg141: memref<1x1000xf16, "cuda">) -> (memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64x3x7x7xf32, "cuda">, memref<1000xf32, "cuda">, memref<1000x512xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128x64x3x3xf32, "cuda">, memref<128x128x3x3xf32, "cuda">, memref<128x64x1x1xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256x128x3x3xf32, "cuda">, memref<256x256x3x3xf32, "cuda">, memref<256x128x1x1xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512x256x3x3xf32, "cuda">, memref<512x512x3x3xf32, "cuda">, memref<512x256x1x1xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf32, "cuda">) attributes {__placeholder__byre.entry_point} { %alloc = memref.alloc() : memref<1x512xf16, "cuda"> byre.compute @MatmulOp_f16f16_f16(%arg141, %arg140, %alloc) {device = "cuda", 
lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<1x1000xf16, "cuda">, memref<512x1000xf16, "cuda">, memref<1x512xf16, "cuda"> @@ -1805,7 +632,7 @@ module attributes {gpu.container_module} { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_10, %arg128, %alloc_13) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> %alloc_14 = memref.alloc() : memref<512x512x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg127, %alloc_10, %alloc_14) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> - %3 = call @Unknown12(%arg127, %alloc_13) : (memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + %3 = call @Unknown4(%arg127, %alloc_13) : (memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">) -> memref<1x512x7x7xf16, "cuda"> %alloc_15 = memref.alloc() : memref<1x512x7x7xf16, "cuda"> %alloc_16 = memref.alloc() : memref<512xf32, "cuda"> %alloc_17 = memref.alloc() : memref<512xf32, "cuda"> @@ -1840,7 +667,7 @@ module attributes {gpu.container_module} { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_30, %arg119, %alloc_33) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> %alloc_34 = memref.alloc() : memref<256x256x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg118, %alloc_30, %alloc_34) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> - %6 = call @Unknown27(%4, %alloc_33, %arg118) : (memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + %6 = call @Unknown19(%4, %alloc_33, %arg118) : (memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">) -> memref<1x256x14x14xf16, "cuda"> %alloc_35 = memref.alloc() : memref<1x256x14x14xf16, "cuda"> %alloc_36 = memref.alloc() : memref<256xf32, "cuda"> %alloc_37 = memref.alloc() : memref<256xf32, "cuda"> @@ -1849,7 +676,7 @@ module attributes {gpu.container_module} { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_35, %arg114, %alloc_38) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = 
"NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> %alloc_39 = memref.alloc() : memref<256x256x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg113, %alloc_35, %alloc_39) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> - %7 = call @Unknown31(%arg113, %alloc_38) : (memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + %7 = call @Unknown23(%arg113, %alloc_38) : (memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">) -> memref<1x256x14x14xf16, "cuda"> %alloc_40 = memref.alloc() : memref<1x256x14x14xf16, "cuda"> %alloc_41 = memref.alloc() : memref<256xf32, "cuda"> %alloc_42 = memref.alloc() : memref<256xf32, "cuda"> @@ -1884,7 +711,7 @@ module attributes {gpu.container_module} { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_55, %arg105, %alloc_58) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> %alloc_59 = memref.alloc() : memref<128x128x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg104, %alloc_55, %alloc_59) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> - %10 = call @Unknown46(%8, %alloc_58, %arg104) : (memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + %10 = call @Unknown38(%8, %alloc_58, %arg104) : (memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">) -> memref<1x128x28x28xf16, "cuda"> %alloc_60 = memref.alloc() : memref<1x128x28x28xf16, "cuda"> %alloc_61 = memref.alloc() : memref<128xf32, "cuda"> %alloc_62 = memref.alloc() : memref<128xf32, "cuda"> @@ -1893,7 +720,7 @@ module attributes {gpu.container_module} { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_60, %arg100, %alloc_63) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> %alloc_64 = memref.alloc() : memref<128x128x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg99, %alloc_60, %alloc_64) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], 
output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> - %11 = call @Unknown50(%arg99, %alloc_63) : (memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + %11 = call @Unknown42(%arg99, %alloc_63) : (memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">) -> memref<1x128x28x28xf16, "cuda"> %alloc_65 = memref.alloc() : memref<1x128x28x28xf16, "cuda"> %alloc_66 = memref.alloc() : memref<128xf32, "cuda"> %alloc_67 = memref.alloc() : memref<128xf32, "cuda"> @@ -1928,7 +755,7 @@ module attributes {gpu.container_module} { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_80, %arg91, %alloc_83) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> %alloc_84 = memref.alloc() : memref<64x64x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg90, %alloc_80, %alloc_84) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - %14 = call @Unknown65(%12, %alloc_83, %arg90) : (memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">) -> memref<1x64x56x56xf16, "cuda"> + %14 = call @Unknown57(%12, %alloc_83, %arg90) : (memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">) -> memref<1x64x56x56xf16, "cuda"> %alloc_85 = memref.alloc() : memref<1x64x56x56xf16, "cuda"> %alloc_86 = memref.alloc() : memref<64xf32, "cuda"> %alloc_87 = memref.alloc() : memref<64xf32, "cuda"> @@ -1937,7 +764,7 @@ module attributes {gpu.container_module} { byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_85, %arg88, %alloc_88) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> %alloc_89 = memref.alloc() : memref<64x64x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg87, %alloc_85, %alloc_89) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - %15 = call @Unknown69(%arg87, %alloc_88) : (memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">) -> memref<1x64x56x56xf16, "cuda"> + %15 = call @Unknown61(%arg87, %alloc_88) : (memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">) -> memref<1x64x56x56xf16, "cuda"> %alloc_90 = memref.alloc() : 
memref<1x64x56x56xf16, "cuda"> %alloc_91 = memref.alloc() : memref<64xf32, "cuda"> %alloc_92 = memref.alloc() : memref<64xf32, "cuda"> @@ -1957,34 +784,31 @@ module attributes {gpu.container_module} { %alloc_99 = memref.alloc() : memref<64x3x7x7xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg81, %alloc_96, %alloc_99) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x3x224x224xf16, "cuda">, memref<1x64x112x112xf16, "cuda">, memref<64x3x7x7xf16, "cuda"> %18 = call @Unknown77(%alloc_99) : (memref<64x3x7x7xf16, "cuda">) -> memref<64x3x7x7xf32, "cuda"> - %19 = call @Unknown78(%arg141) : (memref<1x1000xf16, "cuda">) -> memref<1x1000xf32, "cuda"> - %alloc_100 = memref.alloc() : memref<1000xf32, "cuda"> - byre.compute @ReduceSumOp_f32_f32(%19, %alloc_100) {device = "cuda", dimensions = dense<0> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<1x1000xf32, "cuda">, memref<1000xf32, "cuda"> - %20 = call @Unknown79(%alloc_100) : (memref<1000xf32, "cuda">) -> memref<1000xf32, "cuda"> + %19 = call @Unknown78(%arg141) : (memref<1x1000xf16, "cuda">) -> memref<1000xf32, "cuda"> %collapse_shape = memref.collapse_shape %arg141 [[0, 1]] : memref<1x1000xf16, "cuda"> into memref<1000xf16, "cuda"> %expand_shape = memref.expand_shape %collapse_shape [[0, 1]] : memref<1000xf16, "cuda"> into memref<1000x1xf16, "cuda"> - %alloc_101 = memref.alloc() : memref<1000x512xf16, "cuda"> - byre.compute @MatmulOp_f16f16_f16(%expand_shape, %arg139, %alloc_101) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<1000x1xf16, "cuda">, memref<1x512xf16, "cuda">, memref<1000x512xf16, "cuda"> - %21 = call @Unknown80(%alloc_101) : (memref<1000x512xf16, "cuda">) -> memref<1000x512xf32, "cuda"> - %22 = call @Unknown81(%alloc_94) : (memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> - %23 = call @Unknown82(%alloc_89) : (memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> - %24 = call @Unknown83(%alloc_84) : (memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> - %25 = call @Unknown84(%alloc_79) : (memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> - %26 = call @Unknown85(%alloc_69) : (memref<128x64x3x3xf16, "cuda">) -> memref<128x64x3x3xf32, "cuda"> - %27 = call @Unknown86(%alloc_64) : (memref<128x128x3x3xf16, "cuda">) -> memref<128x128x3x3xf32, "cuda"> - %28 = call @Unknown87(%alloc_74) : (memref<128x64x1x1xf16, "cuda">) -> memref<128x64x1x1xf32, "cuda"> - %29 = call @Unknown88(%alloc_59) : (memref<128x128x3x3xf16, "cuda">) -> memref<128x128x3x3xf32, "cuda"> - %30 = call @Unknown89(%alloc_54) : (memref<128x128x3x3xf16, "cuda">) -> memref<128x128x3x3xf32, "cuda"> - %31 = call @Unknown90(%alloc_44) : (memref<256x128x3x3xf16, "cuda">) -> memref<256x128x3x3xf32, "cuda"> - %32 = call @Unknown91(%alloc_39) : (memref<256x256x3x3xf16, "cuda">) -> memref<256x256x3x3xf32, "cuda"> - %33 = call @Unknown92(%alloc_49) : (memref<256x128x1x1xf16, "cuda">) -> memref<256x128x1x1xf32, "cuda"> - %34 = call @Unknown93(%alloc_34) : (memref<256x256x3x3xf16, "cuda">) -> memref<256x256x3x3xf32, "cuda"> - %35 = call @Unknown94(%alloc_29) : (memref<256x256x3x3xf16, "cuda">) -> memref<256x256x3x3xf32, "cuda"> - %36 = call @Unknown95(%alloc_19) : 
(memref<512x256x3x3xf16, "cuda">) -> memref<512x256x3x3xf32, "cuda"> - %37 = call @Unknown96(%alloc_14) : (memref<512x512x3x3xf16, "cuda">) -> memref<512x512x3x3xf32, "cuda"> - %38 = call @Unknown97(%alloc_24) : (memref<512x256x1x1xf16, "cuda">) -> memref<512x256x1x1xf32, "cuda"> - %39 = call @Unknown98(%alloc_9) : (memref<512x512x3x3xf16, "cuda">) -> memref<512x512x3x3xf32, "cuda"> - %40 = call @Unknown99(%alloc_4) : (memref<512x512x3x3xf16, "cuda">) -> memref<512x512x3x3xf32, "cuda"> - return %alloc_98, %alloc_97, %18, %20, %21, %alloc_92, %alloc_91, %alloc_87, %alloc_86, %22, %23, %alloc_82, %alloc_81, %alloc_77, %alloc_76, %24, %25, %alloc_67, %alloc_66, %alloc_62, %alloc_61, %26, %27, %28, %alloc_72, %alloc_71, %alloc_57, %alloc_56, %alloc_52, %alloc_51, %29, %30, %alloc_42, %alloc_41, %alloc_37, %alloc_36, %31, %32, %33, %alloc_47, %alloc_46, %alloc_32, %alloc_31, %alloc_27, %alloc_26, %34, %35, %alloc_17, %alloc_16, %alloc_12, %alloc_11, %36, %37, %38, %alloc_22, %alloc_21, %alloc_7, %alloc_6, %alloc_2, %alloc_1, %39, %40 : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64x3x7x7xf32, "cuda">, memref<1000xf32, "cuda">, memref<1000x512xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128x64x3x3xf32, "cuda">, memref<128x128x3x3xf32, "cuda">, memref<128x64x1x1xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256x128x3x3xf32, "cuda">, memref<256x256x3x3xf32, "cuda">, memref<256x128x1x1xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512x256x3x3xf32, "cuda">, memref<512x512x3x3xf32, "cuda">, memref<512x256x1x1xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf32, "cuda"> + %alloc_100 = memref.alloc() : memref<1000x512xf16, "cuda"> + byre.compute @MatmulOp_f16f16_f16(%expand_shape, %arg139, %alloc_100) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<1000x1xf16, "cuda">, memref<1x512xf16, "cuda">, memref<1000x512xf16, "cuda"> + %20 = call @Unknown79(%alloc_100) : (memref<1000x512xf16, "cuda">) -> memref<1000x512xf32, "cuda"> + %21 = call @Unknown80(%alloc_94) : (memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> + %22 = call @Unknown80(%alloc_89) : (memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> + %23 = call @Unknown80(%alloc_84) : (memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> + %24 = call @Unknown80(%alloc_79) : 
(memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> + %25 = call @Unknown84(%alloc_69) : (memref<128x64x3x3xf16, "cuda">) -> memref<128x64x3x3xf32, "cuda"> + %26 = call @Unknown85(%alloc_64) : (memref<128x128x3x3xf16, "cuda">) -> memref<128x128x3x3xf32, "cuda"> + %27 = call @Unknown86(%alloc_74) : (memref<128x64x1x1xf16, "cuda">) -> memref<128x64x1x1xf32, "cuda"> + %28 = call @Unknown85(%alloc_59) : (memref<128x128x3x3xf16, "cuda">) -> memref<128x128x3x3xf32, "cuda"> + %29 = call @Unknown85(%alloc_54) : (memref<128x128x3x3xf16, "cuda">) -> memref<128x128x3x3xf32, "cuda"> + %30 = call @Unknown89(%alloc_44) : (memref<256x128x3x3xf16, "cuda">) -> memref<256x128x3x3xf32, "cuda"> + %31 = call @Unknown90(%alloc_39) : (memref<256x256x3x3xf16, "cuda">) -> memref<256x256x3x3xf32, "cuda"> + %32 = call @Unknown91(%alloc_49) : (memref<256x128x1x1xf16, "cuda">) -> memref<256x128x1x1xf32, "cuda"> + %33 = call @Unknown90(%alloc_34) : (memref<256x256x3x3xf16, "cuda">) -> memref<256x256x3x3xf32, "cuda"> + %34 = call @Unknown90(%alloc_29) : (memref<256x256x3x3xf16, "cuda">) -> memref<256x256x3x3xf32, "cuda"> + %35 = call @Unknown94(%alloc_19) : (memref<512x256x3x3xf16, "cuda">) -> memref<512x256x3x3xf32, "cuda"> + %36 = call @Unknown95(%alloc_14) : (memref<512x512x3x3xf16, "cuda">) -> memref<512x512x3x3xf32, "cuda"> + %37 = call @Unknown96(%alloc_24) : (memref<512x256x1x1xf16, "cuda">) -> memref<512x256x1x1xf32, "cuda"> + %38 = call @Unknown95(%alloc_9) : (memref<512x512x3x3xf16, "cuda">) -> memref<512x512x3x3xf32, "cuda"> + %39 = call @Unknown95(%alloc_4) : (memref<512x512x3x3xf16, "cuda">) -> memref<512x512x3x3xf32, "cuda"> + return %alloc_98, %alloc_97, %18, %19, %20, %alloc_92, %alloc_91, %alloc_87, %alloc_86, %21, %22, %alloc_82, %alloc_81, %alloc_77, %alloc_76, %23, %24, %alloc_67, %alloc_66, %alloc_62, %alloc_61, %25, %26, %27, %alloc_72, %alloc_71, %alloc_57, %alloc_56, %alloc_52, %alloc_51, %28, %29, %alloc_42, %alloc_41, %alloc_37, %alloc_36, %30, %31, %32, %alloc_47, %alloc_46, %alloc_32, %alloc_31, %alloc_27, %alloc_26, %33, %34, %alloc_17, %alloc_16, %alloc_12, %alloc_11, %35, %36, %37, %alloc_22, %alloc_21, %alloc_7, %alloc_6, %alloc_2, %alloc_1, %38, %39 : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64x3x7x7xf32, "cuda">, memref<1000xf32, "cuda">, memref<1000x512xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128x64x3x3xf32, "cuda">, memref<128x128x3x3xf32, "cuda">, memref<128x64x1x1xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256x128x3x3xf32, "cuda">, memref<256x256x3x3xf32, "cuda">, memref<256x128x1x1xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, 
"cuda">, memref<512xf32, "cuda">, memref<512x256x3x3xf32, "cuda">, memref<512x512x3x3xf32, "cuda">, memref<512x256x1x1xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf32, "cuda"> } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/BW/9a_byre_host.mlir b/compiler/test/E2E/ResNet18/BW/9a_byre_host.mlir index 775a491e2..80ac58119 100644 --- a/compiler/test/E2E/ResNet18/BW/9a_byre_host.mlir +++ b/compiler/test/E2E/ResNet18/BW/9a_byre_host.mlir @@ -4,915 +4,250 @@ module attributes {byre.container_module, gpu.container_module} { gpu.module @unified { - gpu.func @Unknown99(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf32> - } - gpu.return - } - gpu.func @Unknown98(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : 
index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf32> - } - gpu.return - } - gpu.func @Unknown97(%arg0: memref<512x256x1x1xf16>, %arg1: memref<512x256x1x1xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown96(%arg0: memref<512x256x1x1xf16>, %arg1: memref<512x256x1x1xf32>) kernel { %c131072 = arith.constant 131072 : index + %c0 = arith.constant 0 : index %c256 = arith.constant 256 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c131072 : index - scf.if %5 { - %6 = arith.remsi %4, %c256 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c256 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c256 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<512x256x1x1xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<512x256x1x1xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c131072 step %6 { + %7 = arith.remsi %arg2, %c256 : index + %8 = arith.divsi %arg2, %c256 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<512x256x1x1xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<512x256x1x1xf32> } gpu.return } - gpu.func @Unknown96(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown95(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 
= arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf32> - } - gpu.return - } - gpu.func @Unknown95(%arg0: memref<512x256x3x3xf16>, %arg1: memref<512x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c1179648 = arith.constant 1179648 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1179648 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x256x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c2359296 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x512x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x512x3x3xf32> } gpu.return } - gpu.func @Unknown94(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown94(%arg0: memref<512x256x3x3xf16>, %arg1: memref<512x256x3x3xf32>) kernel { + %c1179648 = arith.constant 1179648 : index %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = 
gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf32> - } - gpu.return - } - gpu.func @Unknown93(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1179648 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = 
arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x256x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x256x3x3xf32> } gpu.return } - gpu.func @Unknown92(%arg0: memref<256x128x1x1xf16>, %arg1: memref<256x128x1x1xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown91(%arg0: memref<256x128x1x1xf16>, %arg1: memref<256x128x1x1xf32>) kernel { %c32768 = arith.constant 32768 : index + %c0 = arith.constant 0 : index %c128 = arith.constant 128 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c32768 : index - scf.if %5 { - %6 = arith.remsi %4, %c128 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c128 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c128 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<256x128x1x1xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<256x128x1x1xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c32768 step %6 { + %7 = arith.remsi %arg2, %c128 : index + %8 = arith.divsi %arg2, %c128 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<256x128x1x1xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<256x128x1x1xf32> } gpu.return } - gpu.func @Unknown91(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown90(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, 
%9] : memref<256x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf32> - } - gpu.return - } - gpu.func @Unknown90(%arg0: memref<256x128x3x3xf16>, %arg1: memref<256x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c294912 = arith.constant 294912 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c294912 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x128x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c589824 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x256x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x256x3x3xf32> } gpu.return } - gpu.func @Unknown89(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown89(%arg0: memref<256x128x3x3xf16>, %arg1: memref<256x128x3x3xf32>) kernel { + %c294912 = arith.constant 294912 : index %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 
= arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf32> - } - gpu.return - } - gpu.func @Unknown88(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c294912 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x128x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x128x3x3xf32> } gpu.return } - gpu.func @Unknown87(%arg0: memref<128x64x1x1xf16>, %arg1: memref<128x64x1x1xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown86(%arg0: memref<128x64x1x1xf16>, %arg1: memref<128x64x1x1xf32>) kernel { %c8192 = arith.constant 8192 : index + %c0 = arith.constant 0 : index %c64 = arith.constant 64 : 
index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c8192 : index - scf.if %5 { - %6 = arith.remsi %4, %c64 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c64 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c64 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<128x64x1x1xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<128x64x1x1xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c8192 step %6 { + %7 = arith.remsi %arg2, %c64 : index + %8 = arith.divsi %arg2, %c64 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<128x64x1x1xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<128x64x1x1xf32> } gpu.return } - gpu.func @Unknown86(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown85(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c147456 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x128x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] 
: memref<128x128x3x3xf32> } gpu.return } - gpu.func @Unknown85(%arg0: memref<128x64x3x3xf16>, %arg1: memref<128x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown84(%arg0: memref<128x64x3x3xf16>, %arg1: memref<128x64x3x3xf32>) kernel { %c73728 = arith.constant 73728 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c73728 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x64x3x3xf32> - } - gpu.return - } - gpu.func @Unknown84(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = 
arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c73728 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x64x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x64x3x3xf32> } gpu.return } - gpu.func @Unknown83(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown80(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> - } - gpu.return - } - gpu.func @Unknown82(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = 
arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c36864 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x64x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x64x3x3xf32> } gpu.return } - gpu.func @Unknown81(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> - } - gpu.return - } - gpu.func @Unknown80(%arg0: memref<1000x512xf16>, %arg1: memref<1000x512xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown79(%arg0: memref<1000x512xf16>, %arg1: memref<1000x512xf32>) kernel { %c512000 = arith.constant 512000 : index %c512 = arith.constant 512 : index - %c-1 = arith.constant -1 : index - %0 = 
gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512000 : index - scf.if %5 { - %6 = arith.remsi %4, %c512 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c512 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c512 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9] : memref<1000x512xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9] : memref<1000x512xf32> - } - gpu.return - } - gpu.func @Unknown79(%arg0: memref<1000xf32>, %arg1: memref<1000xf32>) kernel { - %c1000 = arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1000 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<1000xf32> - %7 = arith.truncf %6 : f32 to f16 - %8 = arith.extf %7 : f16 to f32 - memref.store %8, %arg1[%4] : memref<1000xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c512000 step %6 { + %7 = arith.remsi %arg2, %c512 : index + %8 = arith.divsi %arg2, %c512 : index + %9 = memref.load %arg0[%8, %7] : memref<1000x512xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7] : memref<1000x512xf32> } gpu.return } @@ -924,973 +259,478 @@ module attributes {byre.container_module, gpu.container_module} { %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1000 : index - scf.if %5 { - %6 = memref.load %arg0[%c0, %4] : memref<1x1000xf16> - %7 = arith.extf %6 : f16 to f32 - memref.store %7, %arg1[%c0, %4] : memref<1x1000xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1000 step %6 { + %7 = memref.load %arg0[%c0, %arg2] : memref<1x1000xf16> + %8 = arith.extf %7 : f16 to f32 + %9 = arith.truncf %8 : f32 to f16 + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%c0, %arg2] : memref<1x1000xf32> } gpu.return } gpu.func @Unknown77(%arg0: memref<64x3x7x7xf16>, %arg1: memref<64x3x7x7xf32>) kernel { - %c0 = arith.constant 0 : index %c9408 = arith.constant 9408 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %c3 = arith.constant 3 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c9408 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c3 : index - %27 = 
arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c3 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c3 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x3x7x7xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x3x7x7xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c9408 step %6 { + %7 = arith.remsi %arg2, %c7 : index + %8 = arith.divsi %arg2, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c3 : index + %12 = arith.divsi %10, %c3 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x3x7x7xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x3x7x7xf32> } gpu.return } gpu.func @Unknown74(%arg0: memref<1x64x112x112xf16>, %arg1: memref<1x64x112x112xf16>, %arg2: memref<1x64x112x112xf16>) kernel { + %c802816 = arith.constant 802816 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index %c112 = arith.constant 112 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c112 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c112 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c112 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c112 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c112 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c112 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x112x112xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x112x112xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x64x112x112xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg3, %c112 : index + %8 = arith.divsi %arg3, %c112 : index + %9 = arith.remsi %8, %c112 : index + %10 = arith.divsi %8, %c112 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x112x112xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x64x112x112xf16> + %13 = arith.cmpf ogt, %11, %cst : f16 + %14 = arith.select %13, %12, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x64x112x112xf16> } gpu.return } gpu.func @Unknown73(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) kernel { - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = 
arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = arith.addf %26, %27 : f16 - memref.store %28, %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - } - gpu.return - } - gpu.func @Unknown69(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index %c200704 = arith.constant 200704 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - } - gpu.return - } - gpu.func @Unknown65(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>, %arg3: memref<1x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - 
%13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg3, %c56 : index + %8 = arith.divsi %arg3, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %13 = arith.addf %11, %12 : f16 + memref.store %13, %arg2[%c0, %10, %9, %7] : memref<1x64x56x56xf16> } gpu.return } gpu.func @Unknown61(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) kernel { + %c200704 = arith.constant 200704 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg3, %c56 : index + %8 = arith.divsi %arg3, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %13 = arith.cmpf ogt, %11, %cst : f16 + %14 = arith.select %13, %12, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : 
memref<1x64x56x56xf16> } gpu.return } gpu.func @Unknown57(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>, %arg3: memref<1x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index %c200704 = arith.constant 200704 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - } - gpu.return - } - gpu.func @Unknown50(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) kernel { %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - } - gpu.return - } - gpu.func @Unknown46(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>, %arg3: memref<1x128x28x28xf16>) kernel { - %cst = 
arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index + %c56 = arith.constant 56 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg4, %c56 : index + %8 = arith.divsi %arg4, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %13 = memref.load %arg2[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %14 = arith.addf %11, %12 : f16 + %15 = arith.cmpf ogt, %13, %cst : f16 + %16 = arith.select %15, %14, %cst : f16 + memref.store %16, %arg3[%c0, %10, %9, %7] : memref<1x64x56x56xf16> } gpu.return } gpu.func @Unknown42(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) kernel { + %c100352 = arith.constant 100352 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - 
%26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg3, %c28 : index + %8 = arith.divsi %arg3, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %13 = arith.cmpf ogt, %11, %cst : f16 + %14 = arith.select %13, %12, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x128x28x28xf16> } gpu.return } gpu.func @Unknown38(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>, %arg3: memref<1x128x28x28xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index %c100352 = arith.constant 100352 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - } - gpu.return - } - gpu.func @Unknown31(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>) kernel { %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index 
- %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - } - gpu.return - } - gpu.func @Unknown27(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>, %arg3: memref<1x256x14x14xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index + %c28 = arith.constant 28 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg4, %c28 : index + %8 = arith.divsi %arg4, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %13 = memref.load %arg2[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %14 = arith.addf %11, %12 : f16 + %15 = arith.cmpf ogt, %13, %cst : f16 + %16 = arith.select %15, %14, %cst : f16 + memref.store %16, %arg3[%c0, %10, %9, %7] : memref<1x128x28x28xf16> } gpu.return } gpu.func @Unknown23(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>) kernel { + %c50176 = arith.constant 50176 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index 
- scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c50176 step %6 { + %7 = arith.remsi %arg3, %c14 : index + %8 = arith.divsi %arg3, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %13 = arith.cmpf ogt, %11, %cst : f16 + %14 = arith.select %13, %12, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x256x14x14xf16> } gpu.return } gpu.func @Unknown19(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>, %arg3: memref<1x256x14x14xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index %c50176 = arith.constant 50176 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - } - gpu.return - } - gpu.func @Unknown12(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) kernel { %cst = arith.constant 0.000000e+00 : f16 %c0 = 
arith.constant 0 : index - %c25088 = arith.constant 25088 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index + %c14 = arith.constant 14 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c50176 step %6 { + %7 = arith.remsi %arg4, %c14 : index + %8 = arith.divsi %arg4, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %13 = memref.load %arg2[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %14 = arith.addf %11, %12 : f16 + %15 = arith.cmpf ogt, %13, %cst : f16 + %16 = arith.select %15, %14, %cst : f16 + memref.store %16, %arg3[%c0, %10, %9, %7] : memref<1x256x14x14xf16> } gpu.return } gpu.func @Unknown8(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>, %arg3: memref<1x512x7x7xf16>) kernel { + %c25088 = arith.constant 25088 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %28 = 
memref.load %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c25088 step %6 { + %7 = arith.remsi %arg4, %c7 : index + %8 = arith.divsi %arg4, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %13 = memref.load %arg2[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %14 = arith.addf %11, %12 : f16 + %15 = arith.cmpf ogt, %13, %cst : f16 + %16 = arith.select %15, %14, %cst : f16 + memref.store %16, %arg3[%c0, %10, %9, %7] : memref<1x512x7x7xf16> } gpu.return } gpu.func @Unknown4(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) kernel { + %c25088 = arith.constant 25088 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c25088 step %6 { + %7 = arith.remsi %arg3, %c7 : index + %8 = arith.divsi %arg3, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %13 = arith.cmpf ogt, %11, %cst : f16 + %14 = arith.select %13, %12, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x512x7x7xf16> } gpu.return } gpu.func @Unknown0(%arg0: memref<1x512xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %cst_0 = arith.constant 4.900000e+01 : f16 - %c0 = arith.constant 0 : index %c25088 = arith.constant 25088 : index + %cst = arith.constant 4.900000e+01 : f16 + %cst_0 = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = 
arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = memref.load %arg0[%c0, %25] : memref<1x512xf16> - %28 = arith.divf %27, %cst_0 : f16 - %29 = arith.cmpf ogt, %26, %cst : f16 - %30 = arith.select %29, %28, %cst : f16 - memref.store %30, %arg2[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c25088 step %6 { + %7 = arith.remsi %arg3, %c7 : index + %8 = arith.divsi %arg3, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = memref.load %arg0[%c0, %10] : memref<1x512xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %13 = arith.divf %11, %cst : f16 + %14 = arith.cmpf ogt, %12, %cst_0 : f16 + %15 = arith.select %14, %13, %cst_0 : f16 + memref.store %15, %arg2[%c0, %10, %9, %7] : memref<1x512x7x7xf16> } gpu.return } } func.func @main(%arg0: memref<64xf32, "cuda"> {byre.argname = "Input0", byre.argtype = 1 : i32}, %arg1: memref<64xf32, "cuda"> {byre.argname = "Input1", byre.argtype = 1 : i32}, %arg2: memref<64xf32, "cuda"> {byre.argname = "Input2", byre.argtype = 1 : i32}, %arg3: memref<64xf32, "cuda"> {byre.argname = "Input3", byre.argtype = 1 : i32}, %arg4: memref<64xf32, "cuda"> {byre.argname = "Input4", byre.argtype = 1 : i32}, %arg5: memref<64xf32, "cuda"> {byre.argname = "Input5", byre.argtype = 1 : i32}, %arg6: memref<64xf32, "cuda"> {byre.argname = "Input6", byre.argtype = 1 : i32}, %arg7: memref<64xf32, "cuda"> {byre.argname = "Input7", byre.argtype = 1 : i32}, %arg8: memref<64xf32, "cuda"> {byre.argname = "Input8", byre.argtype = 1 : i32}, %arg9: memref<64xf32, "cuda"> {byre.argname = "Input9", byre.argtype = 1 : i32}, %arg10: memref<128xf32, "cuda"> {byre.argname = "Input10", byre.argtype = 1 : i32}, %arg11: memref<128xf32, "cuda"> {byre.argname = "Input11", byre.argtype = 1 : i32}, %arg12: memref<128xf32, "cuda"> {byre.argname = "Input12", byre.argtype = 1 : i32}, %arg13: memref<128xf32, "cuda"> {byre.argname = "Input13", byre.argtype = 1 : i32}, %arg14: memref<128xf32, "cuda"> {byre.argname = "Input14", byre.argtype = 1 : i32}, %arg15: memref<128xf32, "cuda"> {byre.argname = "Input15", byre.argtype = 1 : i32}, %arg16: memref<128xf32, "cuda"> {byre.argname = "Input16", byre.argtype = 1 : i32}, %arg17: memref<128xf32, "cuda"> {byre.argname = "Input17", byre.argtype = 1 : i32}, %arg18: memref<128xf32, "cuda"> {byre.argname = "Input18", byre.argtype = 1 : i32}, %arg19: memref<128xf32, "cuda"> {byre.argname = "Input19", byre.argtype = 1 : i32}, %arg20: memref<256xf32, "cuda"> {byre.argname = "Input20", byre.argtype = 1 : i32}, %arg21: memref<256xf32, 
"cuda"> {byre.argname = "Input21", byre.argtype = 1 : i32}, %arg22: memref<256xf32, "cuda"> {byre.argname = "Input22", byre.argtype = 1 : i32}, %arg23: memref<256xf32, "cuda"> {byre.argname = "Input23", byre.argtype = 1 : i32}, %arg24: memref<256xf32, "cuda"> {byre.argname = "Input24", byre.argtype = 1 : i32}, %arg25: memref<256xf32, "cuda"> {byre.argname = "Input25", byre.argtype = 1 : i32}, %arg26: memref<256xf32, "cuda"> {byre.argname = "Input26", byre.argtype = 1 : i32}, %arg27: memref<256xf32, "cuda"> {byre.argname = "Input27", byre.argtype = 1 : i32}, %arg28: memref<256xf32, "cuda"> {byre.argname = "Input28", byre.argtype = 1 : i32}, %arg29: memref<256xf32, "cuda"> {byre.argname = "Input29", byre.argtype = 1 : i32}, %arg30: memref<512xf32, "cuda"> {byre.argname = "Input30", byre.argtype = 1 : i32}, %arg31: memref<512xf32, "cuda"> {byre.argname = "Input31", byre.argtype = 1 : i32}, %arg32: memref<512xf32, "cuda"> {byre.argname = "Input32", byre.argtype = 1 : i32}, %arg33: memref<512xf32, "cuda"> {byre.argname = "Input33", byre.argtype = 1 : i32}, %arg34: memref<512xf32, "cuda"> {byre.argname = "Input34", byre.argtype = 1 : i32}, %arg35: memref<512xf32, "cuda"> {byre.argname = "Input35", byre.argtype = 1 : i32}, %arg36: memref<512xf32, "cuda"> {byre.argname = "Input36", byre.argtype = 1 : i32}, %arg37: memref<512xf32, "cuda"> {byre.argname = "Input37", byre.argtype = 1 : i32}, %arg38: memref<512xf32, "cuda"> {byre.argname = "Input38", byre.argtype = 1 : i32}, %arg39: memref<512xf32, "cuda"> {byre.argname = "Input39", byre.argtype = 1 : i32}, %arg40: memref<64xf32, "cuda"> {byre.argname = "Input40", byre.argtype = 1 : i32}, %arg41: memref<64xf32, "cuda"> {byre.argname = "Input41", byre.argtype = 1 : i32}, %arg42: memref<64xf32, "cuda"> {byre.argname = "Input42", byre.argtype = 1 : i32}, %arg43: memref<64xf32, "cuda"> {byre.argname = "Input43", byre.argtype = 1 : i32}, %arg44: memref<64xf32, "cuda"> {byre.argname = "Input44", byre.argtype = 1 : i32}, %arg45: memref<64xf32, "cuda"> {byre.argname = "Input45", byre.argtype = 1 : i32}, %arg46: memref<64xf32, "cuda"> {byre.argname = "Input46", byre.argtype = 1 : i32}, %arg47: memref<64xf32, "cuda"> {byre.argname = "Input47", byre.argtype = 1 : i32}, %arg48: memref<64xf32, "cuda"> {byre.argname = "Input48", byre.argtype = 1 : i32}, %arg49: memref<64xf32, "cuda"> {byre.argname = "Input49", byre.argtype = 1 : i32}, %arg50: memref<128xf32, "cuda"> {byre.argname = "Input50", byre.argtype = 1 : i32}, %arg51: memref<128xf32, "cuda"> {byre.argname = "Input51", byre.argtype = 1 : i32}, %arg52: memref<128xf32, "cuda"> {byre.argname = "Input52", byre.argtype = 1 : i32}, %arg53: memref<128xf32, "cuda"> {byre.argname = "Input53", byre.argtype = 1 : i32}, %arg54: memref<128xf32, "cuda"> {byre.argname = "Input54", byre.argtype = 1 : i32}, %arg55: memref<128xf32, "cuda"> {byre.argname = "Input55", byre.argtype = 1 : i32}, %arg56: memref<128xf32, "cuda"> {byre.argname = "Input56", byre.argtype = 1 : i32}, %arg57: memref<128xf32, "cuda"> {byre.argname = "Input57", byre.argtype = 1 : i32}, %arg58: memref<128xf32, "cuda"> {byre.argname = "Input58", byre.argtype = 1 : i32}, %arg59: memref<128xf32, "cuda"> {byre.argname = "Input59", byre.argtype = 1 : i32}, %arg60: memref<256xf32, "cuda"> {byre.argname = "Input60", byre.argtype = 1 : i32}, %arg61: memref<256xf32, "cuda"> {byre.argname = "Input61", byre.argtype = 1 : i32}, %arg62: memref<256xf32, "cuda"> {byre.argname = "Input62", byre.argtype = 1 : i32}, %arg63: memref<256xf32, "cuda"> {byre.argname = "Input63", 
byre.argtype = 1 : i32}, %arg64: memref<256xf32, "cuda"> {byre.argname = "Input64", byre.argtype = 1 : i32}, %arg65: memref<256xf32, "cuda"> {byre.argname = "Input65", byre.argtype = 1 : i32}, %arg66: memref<256xf32, "cuda"> {byre.argname = "Input66", byre.argtype = 1 : i32}, %arg67: memref<256xf32, "cuda"> {byre.argname = "Input67", byre.argtype = 1 : i32}, %arg68: memref<256xf32, "cuda"> {byre.argname = "Input68", byre.argtype = 1 : i32}, %arg69: memref<256xf32, "cuda"> {byre.argname = "Input69", byre.argtype = 1 : i32}, %arg70: memref<512xf32, "cuda"> {byre.argname = "Input70", byre.argtype = 1 : i32}, %arg71: memref<512xf32, "cuda"> {byre.argname = "Input71", byre.argtype = 1 : i32}, %arg72: memref<512xf32, "cuda"> {byre.argname = "Input72", byre.argtype = 1 : i32}, %arg73: memref<512xf32, "cuda"> {byre.argname = "Input73", byre.argtype = 1 : i32}, %arg74: memref<512xf32, "cuda"> {byre.argname = "Input74", byre.argtype = 1 : i32}, %arg75: memref<512xf32, "cuda"> {byre.argname = "Input75", byre.argtype = 1 : i32}, %arg76: memref<512xf32, "cuda"> {byre.argname = "Input76", byre.argtype = 1 : i32}, %arg77: memref<512xf32, "cuda"> {byre.argname = "Input77", byre.argtype = 1 : i32}, %arg78: memref<512xf32, "cuda"> {byre.argname = "Input78", byre.argtype = 1 : i32}, %arg79: memref<512xf32, "cuda"> {byre.argname = "Input79", byre.argtype = 1 : i32}, %arg80: memref<64x3x7x7xf16, "cuda"> {byre.argname = "Input80", byre.argtype = 1 : i32}, %arg81: memref<1x3x224x224xf16, "cuda"> {byre.argname = "Input81", byre.argtype = 1 : i32}, %arg82: memref<1x64x112x112xf16, "cuda"> {byre.argname = "Input82", byre.argtype = 1 : i32}, %arg83: memref<1x64x112x112xf16, "cuda"> {byre.argname = "Input83", byre.argtype = 1 : i32}, %arg84: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input84", byre.argtype = 1 : i32}, %arg85: memref<64x64x3x3xf16, "cuda"> {byre.argname = "Input85", byre.argtype = 1 : i32}, %arg86: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input86", byre.argtype = 1 : i32}, %arg87: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input87", byre.argtype = 1 : i32}, %arg88: memref<64x64x3x3xf16, "cuda"> {byre.argname = "Input88", byre.argtype = 1 : i32}, %arg89: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input89", byre.argtype = 1 : i32}, %arg90: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input90", byre.argtype = 1 : i32}, %arg91: memref<64x64x3x3xf16, "cuda"> {byre.argname = "Input91", byre.argtype = 1 : i32}, %arg92: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input92", byre.argtype = 1 : i32}, %arg93: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input93", byre.argtype = 1 : i32}, %arg94: memref<64x64x3x3xf16, "cuda"> {byre.argname = "Input94", byre.argtype = 1 : i32}, %arg95: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input95", byre.argtype = 1 : i32}, %arg96: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input96", byre.argtype = 1 : i32}, %arg97: memref<128x64x3x3xf16, "cuda"> {byre.argname = "Input97", byre.argtype = 1 : i32}, %arg98: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input98", byre.argtype = 1 : i32}, %arg99: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input99", byre.argtype = 1 : i32}, %arg100: memref<128x128x3x3xf16, "cuda"> {byre.argname = "Input100", byre.argtype = 1 : i32}, %arg101: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input101", byre.argtype = 1 : i32}, %arg102: memref<128x64x1x1xf16, "cuda"> {byre.argname = "Input102", byre.argtype = 1 : i32}, %arg103: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input103", byre.argtype = 
1 : i32}, %arg104: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input104", byre.argtype = 1 : i32}, %arg105: memref<128x128x3x3xf16, "cuda"> {byre.argname = "Input105", byre.argtype = 1 : i32}, %arg106: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input106", byre.argtype = 1 : i32}, %arg107: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input107", byre.argtype = 1 : i32}, %arg108: memref<128x128x3x3xf16, "cuda"> {byre.argname = "Input108", byre.argtype = 1 : i32}, %arg109: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input109", byre.argtype = 1 : i32}, %arg110: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input110", byre.argtype = 1 : i32}, %arg111: memref<256x128x3x3xf16, "cuda"> {byre.argname = "Input111", byre.argtype = 1 : i32}, %arg112: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input112", byre.argtype = 1 : i32}, %arg113: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input113", byre.argtype = 1 : i32}, %arg114: memref<256x256x3x3xf16, "cuda"> {byre.argname = "Input114", byre.argtype = 1 : i32}, %arg115: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input115", byre.argtype = 1 : i32}, %arg116: memref<256x128x1x1xf16, "cuda"> {byre.argname = "Input116", byre.argtype = 1 : i32}, %arg117: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input117", byre.argtype = 1 : i32}, %arg118: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input118", byre.argtype = 1 : i32}, %arg119: memref<256x256x3x3xf16, "cuda"> {byre.argname = "Input119", byre.argtype = 1 : i32}, %arg120: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input120", byre.argtype = 1 : i32}, %arg121: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input121", byre.argtype = 1 : i32}, %arg122: memref<256x256x3x3xf16, "cuda"> {byre.argname = "Input122", byre.argtype = 1 : i32}, %arg123: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input123", byre.argtype = 1 : i32}, %arg124: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input124", byre.argtype = 1 : i32}, %arg125: memref<512x256x3x3xf16, "cuda"> {byre.argname = "Input125", byre.argtype = 1 : i32}, %arg126: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input126", byre.argtype = 1 : i32}, %arg127: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input127", byre.argtype = 1 : i32}, %arg128: memref<512x512x3x3xf16, "cuda"> {byre.argname = "Input128", byre.argtype = 1 : i32}, %arg129: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input129", byre.argtype = 1 : i32}, %arg130: memref<512x256x1x1xf16, "cuda"> {byre.argname = "Input130", byre.argtype = 1 : i32}, %arg131: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input131", byre.argtype = 1 : i32}, %arg132: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input132", byre.argtype = 1 : i32}, %arg133: memref<512x512x3x3xf16, "cuda"> {byre.argname = "Input133", byre.argtype = 1 : i32}, %arg134: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input134", byre.argtype = 1 : i32}, %arg135: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input135", byre.argtype = 1 : i32}, %arg136: memref<512x512x3x3xf16, "cuda"> {byre.argname = "Input136", byre.argtype = 1 : i32}, %arg137: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input137", byre.argtype = 1 : i32}, %arg138: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input138", byre.argtype = 1 : i32}, %arg139: memref<1x512xf16, "cuda"> {byre.argname = "Input139", byre.argtype = 1 : i32}, %arg140: memref<512x1000xf16, "cuda"> {byre.argname = "Input140", byre.argtype = 1 : i32}, %arg141: memref<1x1000xf16, "cuda"> {byre.argname = "Input141", byre.argtype = 1 : i32}, 
%arg142: memref<64xf32, "cuda"> {byre.argname = "Output0", byre.argtype = 2 : i32}, %arg143: memref<64xf32, "cuda"> {byre.argname = "Output1", byre.argtype = 2 : i32}, %arg144: memref<64x3x7x7xf32, "cuda"> {byre.argname = "Output2", byre.argtype = 2 : i32}, %arg145: memref<1000xf32, "cuda"> {byre.argname = "Output3", byre.argtype = 2 : i32}, %arg146: memref<1000x512xf32, "cuda"> {byre.argname = "Output4", byre.argtype = 2 : i32}, %arg147: memref<64xf32, "cuda"> {byre.argname = "Output5", byre.argtype = 2 : i32}, %arg148: memref<64xf32, "cuda"> {byre.argname = "Output6", byre.argtype = 2 : i32}, %arg149: memref<64xf32, "cuda"> {byre.argname = "Output7", byre.argtype = 2 : i32}, %arg150: memref<64xf32, "cuda"> {byre.argname = "Output8", byre.argtype = 2 : i32}, %arg151: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Output9", byre.argtype = 2 : i32}, %arg152: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Output10", byre.argtype = 2 : i32}, %arg153: memref<64xf32, "cuda"> {byre.argname = "Output11", byre.argtype = 2 : i32}, %arg154: memref<64xf32, "cuda"> {byre.argname = "Output12", byre.argtype = 2 : i32}, %arg155: memref<64xf32, "cuda"> {byre.argname = "Output13", byre.argtype = 2 : i32}, %arg156: memref<64xf32, "cuda"> {byre.argname = "Output14", byre.argtype = 2 : i32}, %arg157: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Output15", byre.argtype = 2 : i32}, %arg158: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Output16", byre.argtype = 2 : i32}, %arg159: memref<128xf32, "cuda"> {byre.argname = "Output17", byre.argtype = 2 : i32}, %arg160: memref<128xf32, "cuda"> {byre.argname = "Output18", byre.argtype = 2 : i32}, %arg161: memref<128xf32, "cuda"> {byre.argname = "Output19", byre.argtype = 2 : i32}, %arg162: memref<128xf32, "cuda"> {byre.argname = "Output20", byre.argtype = 2 : i32}, %arg163: memref<128x64x3x3xf32, "cuda"> {byre.argname = "Output21", byre.argtype = 2 : i32}, %arg164: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Output22", byre.argtype = 2 : i32}, %arg165: memref<128x64x1x1xf32, "cuda"> {byre.argname = "Output23", byre.argtype = 2 : i32}, %arg166: memref<128xf32, "cuda"> {byre.argname = "Output24", byre.argtype = 2 : i32}, %arg167: memref<128xf32, "cuda"> {byre.argname = "Output25", byre.argtype = 2 : i32}, %arg168: memref<128xf32, "cuda"> {byre.argname = "Output26", byre.argtype = 2 : i32}, %arg169: memref<128xf32, "cuda"> {byre.argname = "Output27", byre.argtype = 2 : i32}, %arg170: memref<128xf32, "cuda"> {byre.argname = "Output28", byre.argtype = 2 : i32}, %arg171: memref<128xf32, "cuda"> {byre.argname = "Output29", byre.argtype = 2 : i32}, %arg172: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Output30", byre.argtype = 2 : i32}, %arg173: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Output31", byre.argtype = 2 : i32}, %arg174: memref<256xf32, "cuda"> {byre.argname = "Output32", byre.argtype = 2 : i32}, %arg175: memref<256xf32, "cuda"> {byre.argname = "Output33", byre.argtype = 2 : i32}, %arg176: memref<256xf32, "cuda"> {byre.argname = "Output34", byre.argtype = 2 : i32}, %arg177: memref<256xf32, "cuda"> {byre.argname = "Output35", byre.argtype = 2 : i32}, %arg178: memref<256x128x3x3xf32, "cuda"> {byre.argname = "Output36", byre.argtype = 2 : i32}, %arg179: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Output37", byre.argtype = 2 : i32}, %arg180: memref<256x128x1x1xf32, "cuda"> {byre.argname = "Output38", byre.argtype = 2 : i32}, %arg181: memref<256xf32, "cuda"> {byre.argname = "Output39", byre.argtype = 2 : i32}, %arg182: memref<256xf32, "cuda"> 
{byre.argname = "Output40", byre.argtype = 2 : i32}, %arg183: memref<256xf32, "cuda"> {byre.argname = "Output41", byre.argtype = 2 : i32}, %arg184: memref<256xf32, "cuda"> {byre.argname = "Output42", byre.argtype = 2 : i32}, %arg185: memref<256xf32, "cuda"> {byre.argname = "Output43", byre.argtype = 2 : i32}, %arg186: memref<256xf32, "cuda"> {byre.argname = "Output44", byre.argtype = 2 : i32}, %arg187: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Output45", byre.argtype = 2 : i32}, %arg188: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Output46", byre.argtype = 2 : i32}, %arg189: memref<512xf32, "cuda"> {byre.argname = "Output47", byre.argtype = 2 : i32}, %arg190: memref<512xf32, "cuda"> {byre.argname = "Output48", byre.argtype = 2 : i32}, %arg191: memref<512xf32, "cuda"> {byre.argname = "Output49", byre.argtype = 2 : i32}, %arg192: memref<512xf32, "cuda"> {byre.argname = "Output50", byre.argtype = 2 : i32}, %arg193: memref<512x256x3x3xf32, "cuda"> {byre.argname = "Output51", byre.argtype = 2 : i32}, %arg194: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Output52", byre.argtype = 2 : i32}, %arg195: memref<512x256x1x1xf32, "cuda"> {byre.argname = "Output53", byre.argtype = 2 : i32}, %arg196: memref<512xf32, "cuda"> {byre.argname = "Output54", byre.argtype = 2 : i32}, %arg197: memref<512xf32, "cuda"> {byre.argname = "Output55", byre.argtype = 2 : i32}, %arg198: memref<512xf32, "cuda"> {byre.argname = "Output56", byre.argtype = 2 : i32}, %arg199: memref<512xf32, "cuda"> {byre.argname = "Output57", byre.argtype = 2 : i32}, %arg200: memref<512xf32, "cuda"> {byre.argname = "Output58", byre.argtype = 2 : i32}, %arg201: memref<512xf32, "cuda"> {byre.argname = "Output59", byre.argtype = 2 : i32}, %arg202: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Output60", byre.argtype = 2 : i32}, %arg203: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Output61", byre.argtype = 2 : i32}) attributes {byre.entry_point} { %alloc = memref.alloc() : memref<25927680xi8, "cuda"> - %0 = "byre.alias"(%alloc) {offset = 1671168 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x512xf16, "cuda"> + %0 = "byre.alias"(%alloc) <{offset = 1671168 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x512xf16, "cuda"> byre.compute @MatmulOp_f16f16_f16(%arg141, %arg140, %0) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<1x1000xf16, "cuda">, memref<512x1000xf16, "cuda">, memref<1x512xf16, "cuda"> - %1 = "byre.alias"(%alloc) {offset = 16490496 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> - byre.compute @PTXOp(%0, %arg138, %1) {BlockSize.x = 128 : i32, GridSize.x = 196 : i32, arg_ranks = [2 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown0", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %2 = "byre.alias"(%alloc) {offset = 21209088 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + %1 = "byre.alias"(%alloc) <{offset = 16490496 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + byre.compute @PTXOp(%0, %arg138, %1) {BlockSize.x = 256 : i32, GridSize.x = 25 : i32, arg_ranks = [2 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown0", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> + %2 = "byre.alias"(%alloc) <{offset = 21209088 : i64}> : (memref<25927680xi8, "cuda">) -> 
memref<1x512x7x7xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg137, %arg39, %1, %2, %arg201, %arg200) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - %3 = "byre.alias"(%alloc) {offset = 16540672 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + %3 = "byre.alias"(%alloc) <{offset = 16540672 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%2, %arg136, %3) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %4 = "byre.alias"(%alloc) {offset = 1671168 : i64} : (memref<25927680xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> + %4 = "byre.alias"(%alloc) <{offset = 1671168 : i64}> : (memref<25927680xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg135, %2, %4) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg135, %3, %2) {BlockSize.x = 128 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown4", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> + byre.compute @PTXOp(%arg135, %3, %2) {BlockSize.x = 256 : i32, GridSize.x = 25 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown4", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg134, %arg37, %2, %3, %arg199, %arg198) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - %5 = "byre.alias"(%alloc) {offset = 14131200 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + %5 = "byre.alias"(%alloc) <{offset = 14131200 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%3, %arg133, %5) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %6 = "byre.alias"(%alloc) {offset = 21209088 : i64} : (memref<25927680xi8, "cuda">) -> 
memref<512x512x3x3xf16, "cuda"> + %6 = "byre.alias"(%alloc) <{offset = 21209088 : i64}> : (memref<25927680xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg132, %3, %6) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> - %7 = "byre.alias"(%alloc) {offset = 9740288 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> - byre.compute @PTXOp(%1, %5, %arg132, %7) {BlockSize.x = 128 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown8", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> + %7 = "byre.alias"(%alloc) <{offset = 10919936 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + byre.compute @PTXOp(%1, %5, %arg132, %7) {BlockSize.x = 256 : i32, GridSize.x = 25 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown8", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg129, %arg33, %7, %5, %arg192, %arg191) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - %8 = "byre.alias"(%alloc) {offset = 12525568 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + %8 = "byre.alias"(%alloc) <{offset = 12525568 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%5, %arg128, %8) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %9 = "byre.alias"(%alloc) {offset = 16490496 : i64} : (memref<25927680xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> + %9 = "byre.alias"(%alloc) <{offset = 16490496 : i64}> : (memref<25927680xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg127, %5, %9) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg127, %8, %5) {BlockSize.x = 128 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown12", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, 
memref<1x512x7x7xf16, "cuda"> - %10 = "byre.alias"(%alloc) {offset = 10919936 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + byre.compute @PTXOp(%arg127, %8, %5) {BlockSize.x = 256 : i32, GridSize.x = 25 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown4", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> + %10 = "byre.alias"(%alloc) <{offset = 10970112 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg126, %arg31, %5, %10, %arg190, %arg189) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - %11 = "byre.alias"(%alloc) {offset = 12525568 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + %11 = "byre.alias"(%alloc) <{offset = 12525568 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%10, %arg125, %11) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %12 = "byre.alias"(%alloc) {offset = 14131200 : i64} : (memref<25927680xi8, "cuda">) -> memref<512x256x3x3xf16, "cuda"> + %12 = "byre.alias"(%alloc) <{offset = 14131200 : i64}> : (memref<25927680xi8, "cuda">) -> memref<512x256x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg124, %10, %12) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x256x3x3xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg131, %arg35, %7, %10, %arg197, %arg196) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - %13 = "byre.alias"(%alloc) {offset = 12625920 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%10, %arg130, %13) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %14 = "byre.alias"(%alloc) {offset = 819200 : i64} : (memref<25927680xi8, "cuda">) -> memref<512x256x1x1xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg124, %10, %14) {batch_group_count = 1 : i64, device = "cuda", 
feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda"> - %15 = "byre.alias"(%alloc) {offset = 10919936 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - byre.compute @PTXOp(%13, %11, %arg124, %15) {BlockSize.x = 128 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown19", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg123, %arg29, %15, %11, %arg186, %arg185) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - %16 = "byre.alias"(%alloc) {offset = 11020288 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%11, %arg122, %16) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %17 = "byre.alias"(%alloc) {offset = 9740288 : i64} : (memref<25927680xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg121, %11, %17) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg121, %16, %11) {BlockSize.x = 128 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown23", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg120, %arg27, %11, %16, %arg184, %arg183) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%16, %arg119, %11) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %18 = "byre.alias"(%alloc) {offset = 
7380992 : i64} : (memref<25927680xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg118, %16, %18) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> - %19 = "byre.alias"(%alloc) {offset = 6389760 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - byre.compute @PTXOp(%15, %11, %arg118, %19) {BlockSize.x = 128 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown27", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg115, %arg23, %19, %11, %arg177, %arg176) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%11, %arg114, %15) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %20 = "byre.alias"(%alloc) {offset = 8560640 : i64} : (memref<25927680xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg113, %11, %20) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg113, %15, %11) {BlockSize.x = 128 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown31", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %21 = "byre.alias"(%alloc) {offset = 6490112 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg112, %arg21, %11, %21, %arg175, %arg174) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - %22 = "byre.alias"(%alloc) {offset = 10919936 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%21, %arg111, %22) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = 
"NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %23 = "byre.alias"(%alloc) {offset = 6791168 : i64} : (memref<25927680xi8, "cuda">) -> memref<256x128x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg110, %21, %23) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x128x3x3xf16, "cuda"> - %24 = "byre.alias"(%alloc) {offset = 11120640 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg117, %arg25, %19, %24, %arg182, %arg181) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - %25 = "byre.alias"(%alloc) {offset = 12525568 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + %13 = "byre.alias"(%alloc) <{offset = 9740288 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg131, %arg35, %7, %13, %arg197, %arg196) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + %14 = "byre.alias"(%alloc) <{offset = 12625920 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%13, %arg130, %14) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + %15 = "byre.alias"(%alloc) <{offset = 819200 : i64}> : (memref<25927680xi8, "cuda">) -> memref<512x256x1x1xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg124, %13, %15) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda"> + %16 = "byre.alias"(%alloc) <{offset = 10919936 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + byre.compute @PTXOp(%14, %11, %arg124, %16) {BlockSize.x = 256 : i32, GridSize.x = 49 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown19", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, 
memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg123, %arg29, %16, %11, %arg186, %arg185) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + %17 = "byre.alias"(%alloc) <{offset = 11020288 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%11, %arg122, %17) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + %18 = "byre.alias"(%alloc) <{offset = 9740288 : i64}> : (memref<25927680xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg121, %11, %18) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg121, %17, %11) {BlockSize.x = 256 : i32, GridSize.x = 49 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown23", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg120, %arg27, %11, %17, %arg184, %arg183) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + %19 = "byre.alias"(%alloc) <{offset = 8560640 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%17, %arg119, %19) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + %20 = "byre.alias"(%alloc) <{offset = 7380992 : i64}> : (memref<25927680xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg118, %17, %20) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%16, %19, %arg118, %11) {BlockSize.x = 
256 : i32, GridSize.x = 49 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown19", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg115, %arg23, %11, %16, %arg177, %arg176) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%16, %arg114, %14) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + %21 = "byre.alias"(%alloc) <{offset = 8560640 : i64}> : (memref<25927680xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg113, %16, %21) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg113, %14, %16) {BlockSize.x = 256 : i32, GridSize.x = 49 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown23", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg112, %arg21, %16, %14, %arg175, %arg174) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + %22 = "byre.alias"(%alloc) <{offset = 10919936 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%14, %arg111, %22) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + %23 = "byre.alias"(%alloc) <{offset = 6791168 : i64}> : (memref<25927680xi8, "cuda">) -> memref<256x128x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg110, %14, %23) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, 
memref<256x128x3x3xf16, "cuda"> + %24 = "byre.alias"(%alloc) <{offset = 11120640 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg117, %arg25, %11, %24, %arg182, %arg181) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + %25 = "byre.alias"(%alloc) <{offset = 12525568 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%24, %arg116, %25) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x128x1x1xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %26 = "byre.alias"(%alloc) {offset = 311296 : i64} : (memref<25927680xi8, "cuda">) -> memref<256x128x1x1xf16, "cuda"> + %26 = "byre.alias"(%alloc) <{offset = 311296 : i64}> : (memref<25927680xi8, "cuda">) -> memref<256x128x1x1xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg110, %24, %26) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x128x1x1xf16, "cuda"> - %27 = "byre.alias"(%alloc) {offset = 1081344 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> - byre.compute @PTXOp(%25, %22, %arg110, %27) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown38", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + %27 = "byre.alias"(%alloc) <{offset = 1081344 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%25, %22, %arg110, %27) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown38", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg109, %arg19, %27, %25, %arg171, %arg170) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%25, %arg108, %22) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : 
memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %28 = "byre.alias"(%alloc) {offset = 0 : i64} : (memref<25927680xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + %28 = "byre.alias"(%alloc) <{offset = 0 : i64}> : (memref<25927680xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg107, %25, %28) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg107, %22, %25) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown42", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%arg107, %22, %25) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown42", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg106, %arg17, %25, %22, %arg169, %arg168) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%22, %arg105, %25) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %29 = "byre.alias"(%alloc) {offset = 1376256 : i64} : (memref<25927680xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + %29 = "byre.alias"(%alloc) <{offset = 1376256 : i64}> : (memref<25927680xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg104, %22, %29) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> - %30 = "byre.alias"(%alloc) {offset = 6389760 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> - byre.compute @PTXOp(%27, %25, %arg104, %30) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown46", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + %30 = "byre.alias"(%alloc) <{offset = 6389760 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%27, %25, 
%arg104, %30) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown38", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg101, %arg13, %30, %25, %arg162, %arg161) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%25, %arg100, %22) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %31 = "byre.alias"(%alloc) {offset = 1081344 : i64} : (memref<25927680xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + %31 = "byre.alias"(%alloc) <{offset = 1081344 : i64}> : (memref<25927680xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg99, %25, %31) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg99, %22, %25) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown50", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %32 = "byre.alias"(%alloc) {offset = 6590464 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%arg99, %22, %25) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown42", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + %32 = "byre.alias"(%alloc) <{offset = 6590464 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg98, %arg11, %25, %32, %arg160, %arg159) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - %33 = "byre.alias"(%alloc) {offset = 10919936 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> + %33 = "byre.alias"(%alloc) <{offset = 10919936 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%32, %arg97, %33) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = 
"NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %34 = "byre.alias"(%alloc) {offset = 671744 : i64} : (memref<25927680xi8, "cuda">) -> memref<128x64x3x3xf16, "cuda"> + %34 = "byre.alias"(%alloc) <{offset = 671744 : i64}> : (memref<25927680xi8, "cuda">) -> memref<128x64x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg96, %32, %34) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128x64x3x3xf16, "cuda"> - %35 = "byre.alias"(%alloc) {offset = 11321344 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + %35 = "byre.alias"(%alloc) <{offset = 11321344 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg103, %arg15, %30, %35, %arg167, %arg166) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - %36 = "byre.alias"(%alloc) {offset = 12525568 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> + %36 = "byre.alias"(%alloc) <{offset = 12525568 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%35, %arg102, %36) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x64x1x1xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %37 = "byre.alias"(%alloc) {offset = 294912 : i64} : (memref<25927680xi8, "cuda">) -> memref<128x64x1x1xf16, "cuda"> + %37 = "byre.alias"(%alloc) <{offset = 294912 : i64}> : (memref<25927680xi8, "cuda">) -> memref<128x64x1x1xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg96, %35, %37) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128x64x1x1xf16, "cuda"> - %38 = "byre.alias"(%alloc) {offset = 6389760 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> - byre.compute @PTXOp(%36, %33, %arg96, %38) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown57", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + %38 = "byre.alias"(%alloc) <{offset = 6389760 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> + 
byre.compute @PTXOp(%36, %33, %arg96, %38) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown57", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg95, %arg9, %38, %36, %arg156, %arg155) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%36, %arg94, %33) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %39 = "byre.alias"(%alloc) {offset = 376832 : i64} : (memref<25927680xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + %39 = "byre.alias"(%alloc) <{offset = 598016 : i64}> : (memref<25927680xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg93, %36, %39) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg93, %33, %36) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown61", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + byre.compute @PTXOp(%arg93, %33, %36) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown61", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg92, %arg7, %36, %33, %arg154, %arg153) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%33, %arg91, %36) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %40 = "byre.alias"(%alloc) {offset = 524288 : i64} : (memref<25927680xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + %40 = "byre.alias"(%alloc) <{offset = 524288 : i64}> : (memref<25927680xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> 
byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg90, %33, %40) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%38, %36, %arg90, %33) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown65", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + byre.compute @PTXOp(%38, %36, %arg90, %33) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown57", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg89, %arg5, %33, %36, %arg150, %arg149) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - %41 = "byre.alias"(%alloc) {offset = 11321344 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> + %41 = "byre.alias"(%alloc) <{offset = 11321344 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%36, %arg88, %41) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %42 = "byre.alias"(%alloc) {offset = 598016 : i64} : (memref<25927680xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + %42 = "byre.alias"(%alloc) <{offset = 376832 : i64}> : (memref<25927680xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg87, %36, %42) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg87, %41, %36) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown69", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + byre.compute @PTXOp(%arg87, %41, %36) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown61", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> byre.compute 
@BatchNormGradOp_f16f32f16_f16f32f32(%arg86, %arg3, %36, %41, %arg148, %arg147) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%41, %arg85, %36) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %43 = "byre.alias"(%alloc) {offset = 450560 : i64} : (memref<25927680xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + %43 = "byre.alias"(%alloc) <{offset = 450560 : i64}> : (memref<25927680xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg84, %41, %43) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%33, %36, %38) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown73", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %44 = "byre.alias"(%alloc) {offset = 12525568 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x64x112x112xf16, "cuda"> + byre.compute @PTXOp(%33, %36, %38) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown73", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + %44 = "byre.alias"(%alloc) <{offset = 12525568 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x64x112x112xf16, "cuda"> byre.compute @PoolMaxGradOp_f16f16_f16(%arg83, %38, %44) {device = "cuda", memory_effects = [1 : i32, 1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<1x64x112x112xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x112x112xf16, "cuda"> - %45 = "byre.alias"(%alloc) {offset = 10919936 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x64x112x112xf16, "cuda"> - byre.compute @PTXOp(%arg83, %44, %45) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown74", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x112x112xf16, "cuda">, memref<1x64x112x112xf16, "cuda">, memref<1x64x112x112xf16, "cuda"> + %45 = "byre.alias"(%alloc) <{offset = 10919936 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x64x112x112xf16, "cuda"> + byre.compute @PTXOp(%arg83, %44, %45) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown74", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x112x112xf16, "cuda">, 
memref<1x64x112x112xf16, "cuda">, memref<1x64x112x112xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg82, %arg1, %45, %44, %arg143, %arg142) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<1x64x112x112xf16, "cuda">, memref<1x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - %46 = "byre.alias"(%alloc) {offset = 10919936 : i64} : (memref<25927680xi8, "cuda">) -> memref<64x3x7x7xf16, "cuda"> + %46 = "byre.alias"(%alloc) <{offset = 10919936 : i64}> : (memref<25927680xi8, "cuda">) -> memref<64x3x7x7xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg81, %44, %46) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x3x224x224xf16, "cuda">, memref<1x64x112x112xf16, "cuda">, memref<64x3x7x7xf16, "cuda"> - byre.compute @PTXOp(%46, %arg144) {BlockSize.x = 128 : i32, GridSize.x = 74 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown77", memory_effects = [1 : i32, 2 : i32]} : memref<64x3x7x7xf16, "cuda">, memref<64x3x7x7xf32, "cuda"> - %47 = "byre.alias"(%alloc) {offset = 12525568 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x1000xf32, "cuda"> - byre.compute @PTXOp(%arg141, %47) {BlockSize.x = 128 : i32, GridSize.x = 8 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown78", memory_effects = [1 : i32, 2 : i32]} : memref<1x1000xf16, "cuda">, memref<1x1000xf32, "cuda"> - %48 = "byre.alias"(%alloc) {offset = 10919936 : i64} : (memref<25927680xi8, "cuda">) -> memref<1000xf32, "cuda"> - byre.compute @ReduceSumOp_f32_f32(%47, %48) {device = "cuda", dimensions = dense<0> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<1x1000xf32, "cuda">, memref<1000xf32, "cuda"> - byre.compute @PTXOp(%48, %arg145) {BlockSize.x = 128 : i32, GridSize.x = 8 : i32, arg_ranks = [1 : i32, 1 : i32], kernel_name = "Unknown79", memory_effects = [1 : i32, 2 : i32]} : memref<1000xf32, "cuda">, memref<1000xf32, "cuda"> - %49 = "byre.alias"(%arg141) {offset = 0 : i64} : (memref<1x1000xf16, "cuda">) -> memref<1000x1xf16, "cuda"> - %50 = "byre.alias"(%alloc) {offset = 12525568 : i64} : (memref<25927680xi8, "cuda">) -> memref<1000x512xf16, "cuda"> - byre.compute @MatmulOp_f16f16_f16(%49, %arg139, %50) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<1000x1xf16, "cuda">, memref<1x512xf16, "cuda">, memref<1000x512xf16, "cuda"> - byre.compute @PTXOp(%50, %arg146) {BlockSize.x = 128 : i32, GridSize.x = 4000 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown80", memory_effects = [1 : i32, 2 : i32]} : memref<1000x512xf16, "cuda">, memref<1000x512xf32, "cuda"> - byre.compute @PTXOp(%43, %arg151) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown81", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%42, %arg152) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown82", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> - byre.compute 
@PTXOp(%40, %arg157) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown83", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%39, %arg158) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown84", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%34, %arg163) {BlockSize.x = 128 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown85", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x3x3xf16, "cuda">, memref<128x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%31, %arg164) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown86", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> - byre.compute @PTXOp(%37, %arg165) {BlockSize.x = 128 : i32, GridSize.x = 64 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown87", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x1x1xf16, "cuda">, memref<128x64x1x1xf32, "cuda"> - byre.compute @PTXOp(%29, %arg172) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown88", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> - byre.compute @PTXOp(%28, %arg173) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown89", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> - byre.compute @PTXOp(%23, %arg178) {BlockSize.x = 128 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown90", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x3x3xf16, "cuda">, memref<256x128x3x3xf32, "cuda"> - byre.compute @PTXOp(%20, %arg179) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown91", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> - byre.compute @PTXOp(%26, %arg180) {BlockSize.x = 128 : i32, GridSize.x = 256 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown92", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x1x1xf16, "cuda">, memref<256x128x1x1xf32, "cuda"> - byre.compute @PTXOp(%18, %arg187) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown93", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> - byre.compute @PTXOp(%17, %arg188) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown94", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> - byre.compute @PTXOp(%12, %arg193) {BlockSize.x = 128 : i32, GridSize.x = 9216 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown95", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x3x3xf16, "cuda">, memref<512x256x3x3xf32, "cuda"> - byre.compute @PTXOp(%9, %arg194) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown96", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> - byre.compute @PTXOp(%14, %arg195) {BlockSize.x = 128 : i32, GridSize.x = 1024 : i32, 
arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown97", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x1x1xf16, "cuda">, memref<512x256x1x1xf32, "cuda"> - byre.compute @PTXOp(%6, %arg202) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown98", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> - byre.compute @PTXOp(%4, %arg203) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown99", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> + byre.compute @PTXOp(%46, %arg144) {BlockSize.x = 256 : i32, GridSize.x = 10 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown77", memory_effects = [1 : i32, 2 : i32]} : memref<64x3x7x7xf16, "cuda">, memref<64x3x7x7xf32, "cuda"> + byre.compute @PTXOp(%arg141, %arg145) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown78", memory_effects = [1 : i32, 2 : i32]} : memref<1x1000xf16, "cuda">, memref<1000xf32, "cuda"> + %47 = "byre.alias"(%arg141) <{offset = 0 : i64}> : (memref<1x1000xf16, "cuda">) -> memref<1000x1xf16, "cuda"> + %48 = "byre.alias"(%alloc) <{offset = 12525568 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1000x512xf16, "cuda"> + byre.compute @MatmulOp_f16f16_f16(%47, %arg139, %48) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<1000x1xf16, "cuda">, memref<1x512xf16, "cuda">, memref<1000x512xf16, "cuda"> + byre.compute @PTXOp(%48, %arg146) {BlockSize.x = 256 : i32, GridSize.x = 500 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown79", memory_effects = [1 : i32, 2 : i32]} : memref<1000x512xf16, "cuda">, memref<1000x512xf32, "cuda"> + byre.compute @PTXOp(%43, %arg151) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown80", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> + byre.compute @PTXOp(%42, %arg152) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown80", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> + byre.compute @PTXOp(%40, %arg157) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown80", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> + byre.compute @PTXOp(%39, %arg158) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown80", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> + byre.compute @PTXOp(%34, %arg163) {BlockSize.x = 256 : i32, GridSize.x = 72 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown84", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x3x3xf16, "cuda">, memref<128x64x3x3xf32, "cuda"> + byre.compute @PTXOp(%31, %arg164) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown85", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> + byre.compute @PTXOp(%37, %arg165) {BlockSize.x = 256 : i32, GridSize.x = 8 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown86", memory_effects = [1 : i32, 2 : i32]} : 
memref<128x64x1x1xf16, "cuda">, memref<128x64x1x1xf32, "cuda"> + byre.compute @PTXOp(%29, %arg172) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown85", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> + byre.compute @PTXOp(%28, %arg173) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown85", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> + byre.compute @PTXOp(%23, %arg178) {BlockSize.x = 256 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown89", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x3x3xf16, "cuda">, memref<256x128x3x3xf32, "cuda"> + byre.compute @PTXOp(%21, %arg179) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown90", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> + byre.compute @PTXOp(%26, %arg180) {BlockSize.x = 256 : i32, GridSize.x = 32 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown91", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x1x1xf16, "cuda">, memref<256x128x1x1xf32, "cuda"> + byre.compute @PTXOp(%20, %arg187) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown90", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> + byre.compute @PTXOp(%18, %arg188) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown90", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> + byre.compute @PTXOp(%12, %arg193) {BlockSize.x = 256 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown94", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x3x3xf16, "cuda">, memref<512x256x3x3xf32, "cuda"> + byre.compute @PTXOp(%9, %arg194) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown95", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> + byre.compute @PTXOp(%15, %arg195) {BlockSize.x = 256 : i32, GridSize.x = 128 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown96", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x1x1xf16, "cuda">, memref<512x256x1x1xf32, "cuda"> + byre.compute @PTXOp(%6, %arg202) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown95", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> + byre.compute @PTXOp(%4, %arg203) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown95", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> return } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/BW/9b_nvvm_codegen.mlir b/compiler/test/E2E/ResNet18/BW/9b_nvvm_codegen.mlir index e85948d3f..c4a8ef1c1 100644 --- a/compiler/test/E2E/ResNet18/BW/9b_nvvm_codegen.mlir +++ b/compiler/test/E2E/ResNet18/BW/9b_nvvm_codegen.mlir @@ -4,915 +4,250 @@ module attributes {byre.container_module, gpu.container_module} { gpu.module @unified { - gpu.func @Unknown99(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) 
kernel { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf32> - } - gpu.return - } - gpu.func @Unknown98(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, 
%19, %9] : memref<512x512x3x3xf32> - } - gpu.return - } - gpu.func @Unknown97(%arg0: memref<512x256x1x1xf16>, %arg1: memref<512x256x1x1xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown96(%arg0: memref<512x256x1x1xf16>, %arg1: memref<512x256x1x1xf32>) kernel { %c131072 = arith.constant 131072 : index + %c0 = arith.constant 0 : index %c256 = arith.constant 256 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c131072 : index - scf.if %5 { - %6 = arith.remsi %4, %c256 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c256 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c256 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<512x256x1x1xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<512x256x1x1xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c131072 step %6 { + %7 = arith.remsi %arg2, %c256 : index + %8 = arith.divsi %arg2, %c256 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<512x256x1x1xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<512x256x1x1xf32> } gpu.return } - gpu.func @Unknown96(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown95(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf32> - } - gpu.return - } - gpu.func @Unknown95(%arg0: memref<512x256x3x3xf16>, %arg1: 
memref<512x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c1179648 = arith.constant 1179648 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1179648 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x256x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c2359296 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x512x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x512x3x3xf32> } gpu.return } - gpu.func @Unknown94(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown94(%arg0: memref<512x256x3x3xf16>, %arg1: memref<512x256x3x3xf32>) kernel { + %c1179648 = arith.constant 1179648 : index %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi 
%22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf32> - } - gpu.return - } - gpu.func @Unknown93(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1179648 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x256x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x256x3x3xf32> } gpu.return } - gpu.func @Unknown92(%arg0: memref<256x128x1x1xf16>, %arg1: memref<256x128x1x1xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown91(%arg0: memref<256x128x1x1xf16>, %arg1: memref<256x128x1x1xf32>) kernel { %c32768 = arith.constant 32768 : index + %c0 = arith.constant 0 : index %c128 = arith.constant 128 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c32768 : 
index - scf.if %5 { - %6 = arith.remsi %4, %c128 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c128 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c128 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<256x128x1x1xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<256x128x1x1xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c32768 step %6 { + %7 = arith.remsi %arg2, %c128 : index + %8 = arith.divsi %arg2, %c128 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<256x128x1x1xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<256x128x1x1xf32> } gpu.return } - gpu.func @Unknown91(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown90(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf32> - } - gpu.return - } - gpu.func @Unknown90(%arg0: memref<256x128x3x3xf16>, %arg1: memref<256x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c294912 = arith.constant 294912 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c294912 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi 
%c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x128x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c589824 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x256x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x256x3x3xf32> } gpu.return } - gpu.func @Unknown89(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown89(%arg0: memref<256x128x3x3xf16>, %arg1: memref<256x128x3x3xf32>) kernel { + %c294912 = arith.constant 294912 : index %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - 
memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf32> - } - gpu.return - } - gpu.func @Unknown88(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c294912 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x128x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x128x3x3xf32> } gpu.return } - gpu.func @Unknown87(%arg0: memref<128x64x1x1xf16>, %arg1: memref<128x64x1x1xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown86(%arg0: memref<128x64x1x1xf16>, %arg1: memref<128x64x1x1xf32>) kernel { %c8192 = arith.constant 8192 : index + %c0 = arith.constant 0 : index %c64 = arith.constant 64 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c8192 : index - scf.if %5 { - %6 = arith.remsi %4, %c64 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c64 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c64 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<128x64x1x1xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<128x64x1x1xf32> + %5 = gpu.grid_dim x + %6 = arith.muli 
%1, %5 : index + scf.for %arg2 = %4 to %c8192 step %6 { + %7 = arith.remsi %arg2, %c64 : index + %8 = arith.divsi %arg2, %c64 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<128x64x1x1xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<128x64x1x1xf32> } gpu.return } - gpu.func @Unknown86(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown85(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c147456 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x128x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x128x3x3xf32> } gpu.return } - gpu.func @Unknown85(%arg0: memref<128x64x3x3xf16>, %arg1: memref<128x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown84(%arg0: memref<128x64x3x3xf16>, %arg1: memref<128x64x3x3xf32>) kernel { %c73728 = arith.constant 73728 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c73728 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = 
arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x64x3x3xf32> - } - gpu.return - } - gpu.func @Unknown84(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c73728 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x64x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x64x3x3xf32> } gpu.return } - gpu.func @Unknown83(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = 
arith.constant 0 : index + gpu.func @Unknown80(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> - } - gpu.return - } - gpu.func @Unknown82(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - 
memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c36864 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x64x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x64x3x3xf32> } gpu.return } - gpu.func @Unknown81(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> - } - gpu.return - } - gpu.func @Unknown80(%arg0: memref<1000x512xf16>, %arg1: memref<1000x512xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown79(%arg0: memref<1000x512xf16>, %arg1: memref<1000x512xf32>) kernel { %c512000 = arith.constant 512000 : index %c512 = arith.constant 512 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512000 : index - scf.if %5 { - %6 = arith.remsi %4, %c512 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c512 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c512 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9] : memref<1000x512xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9] : memref<1000x512xf32> - } - gpu.return - } - gpu.func @Unknown79(%arg0: memref<1000xf32>, %arg1: memref<1000xf32>) 
kernel { - %c1000 = arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1000 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<1000xf32> - %7 = arith.truncf %6 : f32 to f16 - %8 = arith.extf %7 : f16 to f32 - memref.store %8, %arg1[%4] : memref<1000xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c512000 step %6 { + %7 = arith.remsi %arg2, %c512 : index + %8 = arith.divsi %arg2, %c512 : index + %9 = memref.load %arg0[%8, %7] : memref<1000x512xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7] : memref<1000x512xf32> } gpu.return } @@ -924,973 +259,478 @@ module attributes {byre.container_module, gpu.container_module} { %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1000 : index - scf.if %5 { - %6 = memref.load %arg0[%c0, %4] : memref<1x1000xf16> - %7 = arith.extf %6 : f16 to f32 - memref.store %7, %arg1[%c0, %4] : memref<1x1000xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1000 step %6 { + %7 = memref.load %arg0[%c0, %arg2] : memref<1x1000xf16> + %8 = arith.extf %7 : f16 to f32 + %9 = arith.truncf %8 : f32 to f16 + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%c0, %arg2] : memref<1x1000xf32> } gpu.return } gpu.func @Unknown77(%arg0: memref<64x3x7x7xf16>, %arg1: memref<64x3x7x7xf32>) kernel { - %c0 = arith.constant 0 : index %c9408 = arith.constant 9408 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %c3 = arith.constant 3 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c9408 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c3 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c3 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c3 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x3x7x7xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x3x7x7xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c9408 step %6 { + %7 = arith.remsi %arg2, %c7 : index + %8 = arith.divsi %arg2, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c3 : index + %12 = arith.divsi 
%10, %c3 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x3x7x7xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x3x7x7xf32> } gpu.return } gpu.func @Unknown74(%arg0: memref<1x64x112x112xf16>, %arg1: memref<1x64x112x112xf16>, %arg2: memref<1x64x112x112xf16>) kernel { + %c802816 = arith.constant 802816 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index %c112 = arith.constant 112 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c112 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c112 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c112 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c112 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c112 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c112 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x112x112xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x112x112xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x64x112x112xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg3, %c112 : index + %8 = arith.divsi %arg3, %c112 : index + %9 = arith.remsi %8, %c112 : index + %10 = arith.divsi %8, %c112 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x112x112xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x64x112x112xf16> + %13 = arith.cmpf ogt, %11, %cst : f16 + %14 = arith.select %13, %12, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x64x112x112xf16> } gpu.return } gpu.func @Unknown73(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) kernel { - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - 
%24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = arith.addf %26, %27 : f16 - memref.store %28, %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - } - gpu.return - } - gpu.func @Unknown69(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index %c200704 = arith.constant 200704 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - } - gpu.return - } - gpu.func @Unknown65(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>, %arg3: memref<1x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select 
%30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg3, %c56 : index + %8 = arith.divsi %arg3, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %13 = arith.addf %11, %12 : f16 + memref.store %13, %arg2[%c0, %10, %9, %7] : memref<1x64x56x56xf16> } gpu.return } gpu.func @Unknown61(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) kernel { + %c200704 = arith.constant 200704 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg3, %c56 : index + %8 = arith.divsi %arg3, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %13 = arith.cmpf ogt, %11, %cst : f16 + %14 = arith.select %13, %12, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x64x56x56xf16> } gpu.return } gpu.func @Unknown57(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>, %arg3: memref<1x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index %c200704 = arith.constant 200704 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - 
%13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - } - gpu.return - } - gpu.func @Unknown50(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) kernel { %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - } - gpu.return - } - gpu.func @Unknown46(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>, %arg3: memref<1x128x28x28xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index + %c56 = arith.constant 56 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 
: index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg4, %c56 : index + %8 = arith.divsi %arg4, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %13 = memref.load %arg2[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %14 = arith.addf %11, %12 : f16 + %15 = arith.cmpf ogt, %13, %cst : f16 + %16 = arith.select %15, %14, %cst : f16 + memref.store %16, %arg3[%c0, %10, %9, %7] : memref<1x64x56x56xf16> } gpu.return } gpu.func @Unknown42(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) kernel { + %c100352 = arith.constant 100352 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg3, %c28 : index + %8 = arith.divsi %arg3, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %13 = arith.cmpf ogt, %11, %cst : f16 + %14 = arith.select %13, %12, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x128x28x28xf16> } gpu.return } 
gpu.func @Unknown38(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>, %arg3: memref<1x128x28x28xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index %c100352 = arith.constant 100352 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - } - gpu.return - } - gpu.func @Unknown31(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>) kernel { %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - } - gpu.return - } - gpu.func @Unknown27(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>, %arg3: memref<1x256x14x14xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = 
arith.constant 0 : index - %c50176 = arith.constant 50176 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index + %c28 = arith.constant 28 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg4, %c28 : index + %8 = arith.divsi %arg4, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %13 = memref.load %arg2[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %14 = arith.addf %11, %12 : f16 + %15 = arith.cmpf ogt, %13, %cst : f16 + %16 = arith.select %15, %14, %cst : f16 + memref.store %16, %arg3[%c0, %10, %9, %7] : memref<1x128x28x28xf16> } gpu.return } gpu.func @Unknown23(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>) kernel { + %c50176 = arith.constant 50176 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : 
memref<1x256x14x14xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c50176 step %6 { + %7 = arith.remsi %arg3, %c14 : index + %8 = arith.divsi %arg3, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %13 = arith.cmpf ogt, %11, %cst : f16 + %14 = arith.select %13, %12, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x256x14x14xf16> } gpu.return } gpu.func @Unknown19(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>, %arg3: memref<1x256x14x14xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index %c50176 = arith.constant 50176 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - } - gpu.return - } - gpu.func @Unknown12(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) kernel { %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index + %c14 = arith.constant 14 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index 
- %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c50176 step %6 { + %7 = arith.remsi %arg4, %c14 : index + %8 = arith.divsi %arg4, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %13 = memref.load %arg2[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %14 = arith.addf %11, %12 : f16 + %15 = arith.cmpf ogt, %13, %cst : f16 + %16 = arith.select %15, %14, %cst : f16 + memref.store %16, %arg3[%c0, %10, %9, %7] : memref<1x256x14x14xf16> } gpu.return } gpu.func @Unknown8(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>, %arg3: memref<1x512x7x7xf16>) kernel { + %c25088 = arith.constant 25088 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg2[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %28 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %29 = arith.addf %27, %28 : f16 - %30 = arith.cmpf ogt, %26, %cst : f16 - %31 = arith.select %30, %29, %cst : f16 - memref.store %31, %arg3[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c25088 step %6 { + %7 = arith.remsi %arg4, %c7 : index + %8 = arith.divsi %arg4, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %13 = memref.load %arg2[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %14 = arith.addf %11, %12 : f16 + %15 = arith.cmpf ogt, %13, %cst : f16 + %16 = arith.select %15, %14, %cst : f16 + memref.store %16, %arg3[%c0, %10, %9, %7] 
: memref<1x512x7x7xf16> } gpu.return } gpu.func @Unknown4(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) kernel { + %c25088 = arith.constant 25088 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %28 = arith.cmpf ogt, %26, %cst : f16 - %29 = arith.select %28, %27, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c25088 step %6 { + %7 = arith.remsi %arg3, %c7 : index + %8 = arith.divsi %arg3, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %13 = arith.cmpf ogt, %11, %cst : f16 + %14 = arith.select %13, %12, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x512x7x7xf16> } gpu.return } gpu.func @Unknown0(%arg0: memref<1x512xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %cst_0 = arith.constant 4.900000e+01 : f16 - %c0 = arith.constant 0 : index %c25088 = arith.constant 25088 : index + %cst = arith.constant 4.900000e+01 : f16 + %cst_0 = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = 
arith.select %20, %24, %23 : index - %26 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = memref.load %arg0[%c0, %25] : memref<1x512xf16> - %28 = arith.divf %27, %cst_0 : f16 - %29 = arith.cmpf ogt, %26, %cst : f16 - %30 = arith.select %29, %28, %cst : f16 - memref.store %30, %arg2[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c25088 step %6 { + %7 = arith.remsi %arg3, %c7 : index + %8 = arith.divsi %arg3, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = memref.load %arg0[%c0, %10] : memref<1x512xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %13 = arith.divf %11, %cst : f16 + %14 = arith.cmpf ogt, %12, %cst_0 : f16 + %15 = arith.select %14, %13, %cst_0 : f16 + memref.store %15, %arg2[%c0, %10, %9, %7] : memref<1x512x7x7xf16> } gpu.return } } func.func @main(%arg0: memref<64xf32, "cuda"> {byre.argname = "Input0", byre.argtype = 1 : i32}, %arg1: memref<64xf32, "cuda"> {byre.argname = "Input1", byre.argtype = 1 : i32}, %arg2: memref<64xf32, "cuda"> {byre.argname = "Input2", byre.argtype = 1 : i32}, %arg3: memref<64xf32, "cuda"> {byre.argname = "Input3", byre.argtype = 1 : i32}, %arg4: memref<64xf32, "cuda"> {byre.argname = "Input4", byre.argtype = 1 : i32}, %arg5: memref<64xf32, "cuda"> {byre.argname = "Input5", byre.argtype = 1 : i32}, %arg6: memref<64xf32, "cuda"> {byre.argname = "Input6", byre.argtype = 1 : i32}, %arg7: memref<64xf32, "cuda"> {byre.argname = "Input7", byre.argtype = 1 : i32}, %arg8: memref<64xf32, "cuda"> {byre.argname = "Input8", byre.argtype = 1 : i32}, %arg9: memref<64xf32, "cuda"> {byre.argname = "Input9", byre.argtype = 1 : i32}, %arg10: memref<128xf32, "cuda"> {byre.argname = "Input10", byre.argtype = 1 : i32}, %arg11: memref<128xf32, "cuda"> {byre.argname = "Input11", byre.argtype = 1 : i32}, %arg12: memref<128xf32, "cuda"> {byre.argname = "Input12", byre.argtype = 1 : i32}, %arg13: memref<128xf32, "cuda"> {byre.argname = "Input13", byre.argtype = 1 : i32}, %arg14: memref<128xf32, "cuda"> {byre.argname = "Input14", byre.argtype = 1 : i32}, %arg15: memref<128xf32, "cuda"> {byre.argname = "Input15", byre.argtype = 1 : i32}, %arg16: memref<128xf32, "cuda"> {byre.argname = "Input16", byre.argtype = 1 : i32}, %arg17: memref<128xf32, "cuda"> {byre.argname = "Input17", byre.argtype = 1 : i32}, %arg18: memref<128xf32, "cuda"> {byre.argname = "Input18", byre.argtype = 1 : i32}, %arg19: memref<128xf32, "cuda"> {byre.argname = "Input19", byre.argtype = 1 : i32}, %arg20: memref<256xf32, "cuda"> {byre.argname = "Input20", byre.argtype = 1 : i32}, %arg21: memref<256xf32, "cuda"> {byre.argname = "Input21", byre.argtype = 1 : i32}, %arg22: memref<256xf32, "cuda"> {byre.argname = "Input22", byre.argtype = 1 : i32}, %arg23: memref<256xf32, "cuda"> {byre.argname = "Input23", byre.argtype = 1 : i32}, %arg24: memref<256xf32, "cuda"> {byre.argname = "Input24", byre.argtype = 1 : i32}, %arg25: memref<256xf32, "cuda"> {byre.argname = "Input25", byre.argtype = 1 : i32}, %arg26: memref<256xf32, "cuda"> {byre.argname = "Input26", byre.argtype = 1 : i32}, %arg27: memref<256xf32, "cuda"> {byre.argname = "Input27", byre.argtype = 1 : i32}, %arg28: memref<256xf32, "cuda"> {byre.argname = "Input28", byre.argtype = 1 : i32}, %arg29: memref<256xf32, "cuda"> {byre.argname = "Input29", byre.argtype = 1 : i32}, %arg30: memref<512xf32, "cuda"> {byre.argname = "Input30", byre.argtype = 1 : i32}, %arg31: memref<512xf32, "cuda"> 
{byre.argname = "Input31", byre.argtype = 1 : i32}, %arg32: memref<512xf32, "cuda"> {byre.argname = "Input32", byre.argtype = 1 : i32}, %arg33: memref<512xf32, "cuda"> {byre.argname = "Input33", byre.argtype = 1 : i32}, %arg34: memref<512xf32, "cuda"> {byre.argname = "Input34", byre.argtype = 1 : i32}, %arg35: memref<512xf32, "cuda"> {byre.argname = "Input35", byre.argtype = 1 : i32}, %arg36: memref<512xf32, "cuda"> {byre.argname = "Input36", byre.argtype = 1 : i32}, %arg37: memref<512xf32, "cuda"> {byre.argname = "Input37", byre.argtype = 1 : i32}, %arg38: memref<512xf32, "cuda"> {byre.argname = "Input38", byre.argtype = 1 : i32}, %arg39: memref<512xf32, "cuda"> {byre.argname = "Input39", byre.argtype = 1 : i32}, %arg40: memref<64xf32, "cuda"> {byre.argname = "Input40", byre.argtype = 1 : i32}, %arg41: memref<64xf32, "cuda"> {byre.argname = "Input41", byre.argtype = 1 : i32}, %arg42: memref<64xf32, "cuda"> {byre.argname = "Input42", byre.argtype = 1 : i32}, %arg43: memref<64xf32, "cuda"> {byre.argname = "Input43", byre.argtype = 1 : i32}, %arg44: memref<64xf32, "cuda"> {byre.argname = "Input44", byre.argtype = 1 : i32}, %arg45: memref<64xf32, "cuda"> {byre.argname = "Input45", byre.argtype = 1 : i32}, %arg46: memref<64xf32, "cuda"> {byre.argname = "Input46", byre.argtype = 1 : i32}, %arg47: memref<64xf32, "cuda"> {byre.argname = "Input47", byre.argtype = 1 : i32}, %arg48: memref<64xf32, "cuda"> {byre.argname = "Input48", byre.argtype = 1 : i32}, %arg49: memref<64xf32, "cuda"> {byre.argname = "Input49", byre.argtype = 1 : i32}, %arg50: memref<128xf32, "cuda"> {byre.argname = "Input50", byre.argtype = 1 : i32}, %arg51: memref<128xf32, "cuda"> {byre.argname = "Input51", byre.argtype = 1 : i32}, %arg52: memref<128xf32, "cuda"> {byre.argname = "Input52", byre.argtype = 1 : i32}, %arg53: memref<128xf32, "cuda"> {byre.argname = "Input53", byre.argtype = 1 : i32}, %arg54: memref<128xf32, "cuda"> {byre.argname = "Input54", byre.argtype = 1 : i32}, %arg55: memref<128xf32, "cuda"> {byre.argname = "Input55", byre.argtype = 1 : i32}, %arg56: memref<128xf32, "cuda"> {byre.argname = "Input56", byre.argtype = 1 : i32}, %arg57: memref<128xf32, "cuda"> {byre.argname = "Input57", byre.argtype = 1 : i32}, %arg58: memref<128xf32, "cuda"> {byre.argname = "Input58", byre.argtype = 1 : i32}, %arg59: memref<128xf32, "cuda"> {byre.argname = "Input59", byre.argtype = 1 : i32}, %arg60: memref<256xf32, "cuda"> {byre.argname = "Input60", byre.argtype = 1 : i32}, %arg61: memref<256xf32, "cuda"> {byre.argname = "Input61", byre.argtype = 1 : i32}, %arg62: memref<256xf32, "cuda"> {byre.argname = "Input62", byre.argtype = 1 : i32}, %arg63: memref<256xf32, "cuda"> {byre.argname = "Input63", byre.argtype = 1 : i32}, %arg64: memref<256xf32, "cuda"> {byre.argname = "Input64", byre.argtype = 1 : i32}, %arg65: memref<256xf32, "cuda"> {byre.argname = "Input65", byre.argtype = 1 : i32}, %arg66: memref<256xf32, "cuda"> {byre.argname = "Input66", byre.argtype = 1 : i32}, %arg67: memref<256xf32, "cuda"> {byre.argname = "Input67", byre.argtype = 1 : i32}, %arg68: memref<256xf32, "cuda"> {byre.argname = "Input68", byre.argtype = 1 : i32}, %arg69: memref<256xf32, "cuda"> {byre.argname = "Input69", byre.argtype = 1 : i32}, %arg70: memref<512xf32, "cuda"> {byre.argname = "Input70", byre.argtype = 1 : i32}, %arg71: memref<512xf32, "cuda"> {byre.argname = "Input71", byre.argtype = 1 : i32}, %arg72: memref<512xf32, "cuda"> {byre.argname = "Input72", byre.argtype = 1 : i32}, %arg73: memref<512xf32, "cuda"> {byre.argname = "Input73", 
byre.argtype = 1 : i32}, %arg74: memref<512xf32, "cuda"> {byre.argname = "Input74", byre.argtype = 1 : i32}, %arg75: memref<512xf32, "cuda"> {byre.argname = "Input75", byre.argtype = 1 : i32}, %arg76: memref<512xf32, "cuda"> {byre.argname = "Input76", byre.argtype = 1 : i32}, %arg77: memref<512xf32, "cuda"> {byre.argname = "Input77", byre.argtype = 1 : i32}, %arg78: memref<512xf32, "cuda"> {byre.argname = "Input78", byre.argtype = 1 : i32}, %arg79: memref<512xf32, "cuda"> {byre.argname = "Input79", byre.argtype = 1 : i32}, %arg80: memref<64x3x7x7xf16, "cuda"> {byre.argname = "Input80", byre.argtype = 1 : i32}, %arg81: memref<1x3x224x224xf16, "cuda"> {byre.argname = "Input81", byre.argtype = 1 : i32}, %arg82: memref<1x64x112x112xf16, "cuda"> {byre.argname = "Input82", byre.argtype = 1 : i32}, %arg83: memref<1x64x112x112xf16, "cuda"> {byre.argname = "Input83", byre.argtype = 1 : i32}, %arg84: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input84", byre.argtype = 1 : i32}, %arg85: memref<64x64x3x3xf16, "cuda"> {byre.argname = "Input85", byre.argtype = 1 : i32}, %arg86: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input86", byre.argtype = 1 : i32}, %arg87: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input87", byre.argtype = 1 : i32}, %arg88: memref<64x64x3x3xf16, "cuda"> {byre.argname = "Input88", byre.argtype = 1 : i32}, %arg89: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input89", byre.argtype = 1 : i32}, %arg90: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input90", byre.argtype = 1 : i32}, %arg91: memref<64x64x3x3xf16, "cuda"> {byre.argname = "Input91", byre.argtype = 1 : i32}, %arg92: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input92", byre.argtype = 1 : i32}, %arg93: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input93", byre.argtype = 1 : i32}, %arg94: memref<64x64x3x3xf16, "cuda"> {byre.argname = "Input94", byre.argtype = 1 : i32}, %arg95: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input95", byre.argtype = 1 : i32}, %arg96: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input96", byre.argtype = 1 : i32}, %arg97: memref<128x64x3x3xf16, "cuda"> {byre.argname = "Input97", byre.argtype = 1 : i32}, %arg98: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input98", byre.argtype = 1 : i32}, %arg99: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input99", byre.argtype = 1 : i32}, %arg100: memref<128x128x3x3xf16, "cuda"> {byre.argname = "Input100", byre.argtype = 1 : i32}, %arg101: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input101", byre.argtype = 1 : i32}, %arg102: memref<128x64x1x1xf16, "cuda"> {byre.argname = "Input102", byre.argtype = 1 : i32}, %arg103: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input103", byre.argtype = 1 : i32}, %arg104: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input104", byre.argtype = 1 : i32}, %arg105: memref<128x128x3x3xf16, "cuda"> {byre.argname = "Input105", byre.argtype = 1 : i32}, %arg106: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input106", byre.argtype = 1 : i32}, %arg107: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input107", byre.argtype = 1 : i32}, %arg108: memref<128x128x3x3xf16, "cuda"> {byre.argname = "Input108", byre.argtype = 1 : i32}, %arg109: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input109", byre.argtype = 1 : i32}, %arg110: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input110", byre.argtype = 1 : i32}, %arg111: memref<256x128x3x3xf16, "cuda"> {byre.argname = "Input111", byre.argtype = 1 : i32}, %arg112: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input112", 
byre.argtype = 1 : i32}, %arg113: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input113", byre.argtype = 1 : i32}, %arg114: memref<256x256x3x3xf16, "cuda"> {byre.argname = "Input114", byre.argtype = 1 : i32}, %arg115: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input115", byre.argtype = 1 : i32}, %arg116: memref<256x128x1x1xf16, "cuda"> {byre.argname = "Input116", byre.argtype = 1 : i32}, %arg117: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input117", byre.argtype = 1 : i32}, %arg118: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input118", byre.argtype = 1 : i32}, %arg119: memref<256x256x3x3xf16, "cuda"> {byre.argname = "Input119", byre.argtype = 1 : i32}, %arg120: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input120", byre.argtype = 1 : i32}, %arg121: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input121", byre.argtype = 1 : i32}, %arg122: memref<256x256x3x3xf16, "cuda"> {byre.argname = "Input122", byre.argtype = 1 : i32}, %arg123: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input123", byre.argtype = 1 : i32}, %arg124: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input124", byre.argtype = 1 : i32}, %arg125: memref<512x256x3x3xf16, "cuda"> {byre.argname = "Input125", byre.argtype = 1 : i32}, %arg126: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input126", byre.argtype = 1 : i32}, %arg127: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input127", byre.argtype = 1 : i32}, %arg128: memref<512x512x3x3xf16, "cuda"> {byre.argname = "Input128", byre.argtype = 1 : i32}, %arg129: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input129", byre.argtype = 1 : i32}, %arg130: memref<512x256x1x1xf16, "cuda"> {byre.argname = "Input130", byre.argtype = 1 : i32}, %arg131: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input131", byre.argtype = 1 : i32}, %arg132: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input132", byre.argtype = 1 : i32}, %arg133: memref<512x512x3x3xf16, "cuda"> {byre.argname = "Input133", byre.argtype = 1 : i32}, %arg134: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input134", byre.argtype = 1 : i32}, %arg135: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input135", byre.argtype = 1 : i32}, %arg136: memref<512x512x3x3xf16, "cuda"> {byre.argname = "Input136", byre.argtype = 1 : i32}, %arg137: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input137", byre.argtype = 1 : i32}, %arg138: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input138", byre.argtype = 1 : i32}, %arg139: memref<1x512xf16, "cuda"> {byre.argname = "Input139", byre.argtype = 1 : i32}, %arg140: memref<512x1000xf16, "cuda"> {byre.argname = "Input140", byre.argtype = 1 : i32}, %arg141: memref<1x1000xf16, "cuda"> {byre.argname = "Input141", byre.argtype = 1 : i32}, %arg142: memref<64xf32, "cuda"> {byre.argname = "Output0", byre.argtype = 2 : i32}, %arg143: memref<64xf32, "cuda"> {byre.argname = "Output1", byre.argtype = 2 : i32}, %arg144: memref<64x3x7x7xf32, "cuda"> {byre.argname = "Output2", byre.argtype = 2 : i32}, %arg145: memref<1000xf32, "cuda"> {byre.argname = "Output3", byre.argtype = 2 : i32}, %arg146: memref<1000x512xf32, "cuda"> {byre.argname = "Output4", byre.argtype = 2 : i32}, %arg147: memref<64xf32, "cuda"> {byre.argname = "Output5", byre.argtype = 2 : i32}, %arg148: memref<64xf32, "cuda"> {byre.argname = "Output6", byre.argtype = 2 : i32}, %arg149: memref<64xf32, "cuda"> {byre.argname = "Output7", byre.argtype = 2 : i32}, %arg150: memref<64xf32, "cuda"> {byre.argname = "Output8", byre.argtype = 2 : i32}, %arg151: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Output9", 
byre.argtype = 2 : i32}, %arg152: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Output10", byre.argtype = 2 : i32}, %arg153: memref<64xf32, "cuda"> {byre.argname = "Output11", byre.argtype = 2 : i32}, %arg154: memref<64xf32, "cuda"> {byre.argname = "Output12", byre.argtype = 2 : i32}, %arg155: memref<64xf32, "cuda"> {byre.argname = "Output13", byre.argtype = 2 : i32}, %arg156: memref<64xf32, "cuda"> {byre.argname = "Output14", byre.argtype = 2 : i32}, %arg157: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Output15", byre.argtype = 2 : i32}, %arg158: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Output16", byre.argtype = 2 : i32}, %arg159: memref<128xf32, "cuda"> {byre.argname = "Output17", byre.argtype = 2 : i32}, %arg160: memref<128xf32, "cuda"> {byre.argname = "Output18", byre.argtype = 2 : i32}, %arg161: memref<128xf32, "cuda"> {byre.argname = "Output19", byre.argtype = 2 : i32}, %arg162: memref<128xf32, "cuda"> {byre.argname = "Output20", byre.argtype = 2 : i32}, %arg163: memref<128x64x3x3xf32, "cuda"> {byre.argname = "Output21", byre.argtype = 2 : i32}, %arg164: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Output22", byre.argtype = 2 : i32}, %arg165: memref<128x64x1x1xf32, "cuda"> {byre.argname = "Output23", byre.argtype = 2 : i32}, %arg166: memref<128xf32, "cuda"> {byre.argname = "Output24", byre.argtype = 2 : i32}, %arg167: memref<128xf32, "cuda"> {byre.argname = "Output25", byre.argtype = 2 : i32}, %arg168: memref<128xf32, "cuda"> {byre.argname = "Output26", byre.argtype = 2 : i32}, %arg169: memref<128xf32, "cuda"> {byre.argname = "Output27", byre.argtype = 2 : i32}, %arg170: memref<128xf32, "cuda"> {byre.argname = "Output28", byre.argtype = 2 : i32}, %arg171: memref<128xf32, "cuda"> {byre.argname = "Output29", byre.argtype = 2 : i32}, %arg172: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Output30", byre.argtype = 2 : i32}, %arg173: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Output31", byre.argtype = 2 : i32}, %arg174: memref<256xf32, "cuda"> {byre.argname = "Output32", byre.argtype = 2 : i32}, %arg175: memref<256xf32, "cuda"> {byre.argname = "Output33", byre.argtype = 2 : i32}, %arg176: memref<256xf32, "cuda"> {byre.argname = "Output34", byre.argtype = 2 : i32}, %arg177: memref<256xf32, "cuda"> {byre.argname = "Output35", byre.argtype = 2 : i32}, %arg178: memref<256x128x3x3xf32, "cuda"> {byre.argname = "Output36", byre.argtype = 2 : i32}, %arg179: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Output37", byre.argtype = 2 : i32}, %arg180: memref<256x128x1x1xf32, "cuda"> {byre.argname = "Output38", byre.argtype = 2 : i32}, %arg181: memref<256xf32, "cuda"> {byre.argname = "Output39", byre.argtype = 2 : i32}, %arg182: memref<256xf32, "cuda"> {byre.argname = "Output40", byre.argtype = 2 : i32}, %arg183: memref<256xf32, "cuda"> {byre.argname = "Output41", byre.argtype = 2 : i32}, %arg184: memref<256xf32, "cuda"> {byre.argname = "Output42", byre.argtype = 2 : i32}, %arg185: memref<256xf32, "cuda"> {byre.argname = "Output43", byre.argtype = 2 : i32}, %arg186: memref<256xf32, "cuda"> {byre.argname = "Output44", byre.argtype = 2 : i32}, %arg187: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Output45", byre.argtype = 2 : i32}, %arg188: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Output46", byre.argtype = 2 : i32}, %arg189: memref<512xf32, "cuda"> {byre.argname = "Output47", byre.argtype = 2 : i32}, %arg190: memref<512xf32, "cuda"> {byre.argname = "Output48", byre.argtype = 2 : i32}, %arg191: memref<512xf32, "cuda"> {byre.argname = "Output49", byre.argtype = 2 : 
i32}, %arg192: memref<512xf32, "cuda"> {byre.argname = "Output50", byre.argtype = 2 : i32}, %arg193: memref<512x256x3x3xf32, "cuda"> {byre.argname = "Output51", byre.argtype = 2 : i32}, %arg194: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Output52", byre.argtype = 2 : i32}, %arg195: memref<512x256x1x1xf32, "cuda"> {byre.argname = "Output53", byre.argtype = 2 : i32}, %arg196: memref<512xf32, "cuda"> {byre.argname = "Output54", byre.argtype = 2 : i32}, %arg197: memref<512xf32, "cuda"> {byre.argname = "Output55", byre.argtype = 2 : i32}, %arg198: memref<512xf32, "cuda"> {byre.argname = "Output56", byre.argtype = 2 : i32}, %arg199: memref<512xf32, "cuda"> {byre.argname = "Output57", byre.argtype = 2 : i32}, %arg200: memref<512xf32, "cuda"> {byre.argname = "Output58", byre.argtype = 2 : i32}, %arg201: memref<512xf32, "cuda"> {byre.argname = "Output59", byre.argtype = 2 : i32}, %arg202: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Output60", byre.argtype = 2 : i32}, %arg203: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Output61", byre.argtype = 2 : i32}) attributes {byre.entry_point} { %alloc = memref.alloc() : memref<25927680xi8, "cuda"> - %0 = "byre.alias"(%alloc) {offset = 1671168 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x512xf16, "cuda"> + %0 = "byre.alias"(%alloc) <{offset = 1671168 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x512xf16, "cuda"> byre.compute @MatmulOp_f16f16_f16(%arg141, %arg140, %0) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<1x1000xf16, "cuda">, memref<512x1000xf16, "cuda">, memref<1x512xf16, "cuda"> - %1 = "byre.alias"(%alloc) {offset = 16490496 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> - byre.compute @PTXOp(%0, %arg138, %1) {BlockSize.x = 128 : i32, GridSize.x = 196 : i32, arg_ranks = [2 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown0", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %2 = "byre.alias"(%alloc) {offset = 21209088 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + %1 = "byre.alias"(%alloc) <{offset = 16490496 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + byre.compute @PTXOp(%0, %arg138, %1) {BlockSize.x = 256 : i32, GridSize.x = 25 : i32, arg_ranks = [2 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown0", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> + %2 = "byre.alias"(%alloc) <{offset = 21209088 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg137, %arg39, %1, %2, %arg201, %arg200) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - %3 = "byre.alias"(%alloc) {offset = 16540672 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + %3 = "byre.alias"(%alloc) <{offset = 16540672 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%2, %arg136, %3) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", 
kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %4 = "byre.alias"(%alloc) {offset = 1671168 : i64} : (memref<25927680xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> + %4 = "byre.alias"(%alloc) <{offset = 1671168 : i64}> : (memref<25927680xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg135, %2, %4) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg135, %3, %2) {BlockSize.x = 128 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown4", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> + byre.compute @PTXOp(%arg135, %3, %2) {BlockSize.x = 256 : i32, GridSize.x = 25 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown4", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg134, %arg37, %2, %3, %arg199, %arg198) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - %5 = "byre.alias"(%alloc) {offset = 14131200 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + %5 = "byre.alias"(%alloc) <{offset = 14131200 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%3, %arg133, %5) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %6 = "byre.alias"(%alloc) {offset = 21209088 : i64} : (memref<25927680xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> + %6 = "byre.alias"(%alloc) <{offset = 21209088 : i64}> : (memref<25927680xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg132, %3, %6) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> - %7 = "byre.alias"(%alloc) {offset = 9740288 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> - byre.compute @PTXOp(%1, %5, %arg132, %7) {BlockSize.x = 128 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], 
kernel_name = "Unknown8", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> + %7 = "byre.alias"(%alloc) <{offset = 10919936 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + byre.compute @PTXOp(%1, %5, %arg132, %7) {BlockSize.x = 256 : i32, GridSize.x = 25 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown8", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg129, %arg33, %7, %5, %arg192, %arg191) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - %8 = "byre.alias"(%alloc) {offset = 12525568 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + %8 = "byre.alias"(%alloc) <{offset = 12525568 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%5, %arg128, %8) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %9 = "byre.alias"(%alloc) {offset = 16490496 : i64} : (memref<25927680xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> + %9 = "byre.alias"(%alloc) <{offset = 16490496 : i64}> : (memref<25927680xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg127, %5, %9) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg127, %8, %5) {BlockSize.x = 128 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown12", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %10 = "byre.alias"(%alloc) {offset = 10919936 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + byre.compute @PTXOp(%arg127, %8, %5) {BlockSize.x = 256 : i32, GridSize.x = 25 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown4", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> + %10 = "byre.alias"(%alloc) <{offset = 10970112 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg126, %arg31, %5, %10, %arg190, %arg189) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, 
memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - %11 = "byre.alias"(%alloc) {offset = 12525568 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + %11 = "byre.alias"(%alloc) <{offset = 12525568 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%10, %arg125, %11) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %12 = "byre.alias"(%alloc) {offset = 14131200 : i64} : (memref<25927680xi8, "cuda">) -> memref<512x256x3x3xf16, "cuda"> + %12 = "byre.alias"(%alloc) <{offset = 14131200 : i64}> : (memref<25927680xi8, "cuda">) -> memref<512x256x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg124, %10, %12) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x256x3x3xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg131, %arg35, %7, %10, %arg197, %arg196) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - %13 = "byre.alias"(%alloc) {offset = 12625920 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%10, %arg130, %13) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %14 = "byre.alias"(%alloc) {offset = 819200 : i64} : (memref<25927680xi8, "cuda">) -> memref<512x256x1x1xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg124, %10, %14) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda"> - %15 = "byre.alias"(%alloc) {offset = 10919936 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - byre.compute @PTXOp(%13, %11, %arg124, %15) {BlockSize.x = 128 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown19", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - byre.compute 
@BatchNormGradOp_f16f32f16_f16f32f32(%arg123, %arg29, %15, %11, %arg186, %arg185) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - %16 = "byre.alias"(%alloc) {offset = 11020288 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%11, %arg122, %16) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %17 = "byre.alias"(%alloc) {offset = 9740288 : i64} : (memref<25927680xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg121, %11, %17) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg121, %16, %11) {BlockSize.x = 128 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown23", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg120, %arg27, %11, %16, %arg184, %arg183) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%16, %arg119, %11) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %18 = "byre.alias"(%alloc) {offset = 7380992 : i64} : (memref<25927680xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg118, %16, %18) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> - %19 = "byre.alias"(%alloc) {offset = 6389760 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - byre.compute @PTXOp(%15, %11, %arg118, %19) {BlockSize.x = 128 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown27", 
memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg115, %arg23, %19, %11, %arg177, %arg176) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%11, %arg114, %15) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %20 = "byre.alias"(%alloc) {offset = 8560640 : i64} : (memref<25927680xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg113, %11, %20) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg113, %15, %11) {BlockSize.x = 128 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown31", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %21 = "byre.alias"(%alloc) {offset = 6490112 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg112, %arg21, %11, %21, %arg175, %arg174) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - %22 = "byre.alias"(%alloc) {offset = 10919936 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%21, %arg111, %22) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %23 = "byre.alias"(%alloc) {offset = 6791168 : i64} : (memref<25927680xi8, "cuda">) -> memref<256x128x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg110, %21, %23) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, 
memref<256x128x3x3xf16, "cuda"> - %24 = "byre.alias"(%alloc) {offset = 11120640 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg117, %arg25, %19, %24, %arg182, %arg181) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - %25 = "byre.alias"(%alloc) {offset = 12525568 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + %13 = "byre.alias"(%alloc) <{offset = 9740288 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg131, %arg35, %7, %13, %arg197, %arg196) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + %14 = "byre.alias"(%alloc) <{offset = 12625920 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%13, %arg130, %14) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + %15 = "byre.alias"(%alloc) <{offset = 819200 : i64}> : (memref<25927680xi8, "cuda">) -> memref<512x256x1x1xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg124, %13, %15) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda"> + %16 = "byre.alias"(%alloc) <{offset = 10919936 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + byre.compute @PTXOp(%14, %11, %arg124, %16) {BlockSize.x = 256 : i32, GridSize.x = 49 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown19", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg123, %arg29, %16, %11, %arg186, %arg185) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + %17 = "byre.alias"(%alloc) <{offset = 11020288 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%11, %arg122, %17) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", 
kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + %18 = "byre.alias"(%alloc) <{offset = 9740288 : i64}> : (memref<25927680xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg121, %11, %18) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg121, %17, %11) {BlockSize.x = 256 : i32, GridSize.x = 49 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown23", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg120, %arg27, %11, %17, %arg184, %arg183) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + %19 = "byre.alias"(%alloc) <{offset = 8560640 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%17, %arg119, %19) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + %20 = "byre.alias"(%alloc) <{offset = 7380992 : i64}> : (memref<25927680xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg118, %17, %20) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%16, %19, %arg118, %11) {BlockSize.x = 256 : i32, GridSize.x = 49 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown19", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg115, %arg23, %11, %16, %arg177, %arg176) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%16, %arg114, %14) {batch_group_count = 1 : 
i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + %21 = "byre.alias"(%alloc) <{offset = 8560640 : i64}> : (memref<25927680xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg113, %16, %21) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg113, %14, %16) {BlockSize.x = 256 : i32, GridSize.x = 49 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown23", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg112, %arg21, %16, %14, %arg175, %arg174) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + %22 = "byre.alias"(%alloc) <{offset = 10919936 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%14, %arg111, %22) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + %23 = "byre.alias"(%alloc) <{offset = 6791168 : i64}> : (memref<25927680xi8, "cuda">) -> memref<256x128x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg110, %14, %23) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x128x3x3xf16, "cuda"> + %24 = "byre.alias"(%alloc) <{offset = 11120640 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg117, %arg25, %11, %24, %arg182, %arg181) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + %25 = "byre.alias"(%alloc) <{offset = 12525568 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%24, %arg116, %25) {batch_group_count = 1 : i64, device = "cuda", 
feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x128x1x1xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %26 = "byre.alias"(%alloc) {offset = 311296 : i64} : (memref<25927680xi8, "cuda">) -> memref<256x128x1x1xf16, "cuda"> + %26 = "byre.alias"(%alloc) <{offset = 311296 : i64}> : (memref<25927680xi8, "cuda">) -> memref<256x128x1x1xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg110, %24, %26) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x128x1x1xf16, "cuda"> - %27 = "byre.alias"(%alloc) {offset = 1081344 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> - byre.compute @PTXOp(%25, %22, %arg110, %27) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown38", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + %27 = "byre.alias"(%alloc) <{offset = 1081344 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%25, %22, %arg110, %27) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown38", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg109, %arg19, %27, %25, %arg171, %arg170) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%25, %arg108, %22) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %28 = "byre.alias"(%alloc) {offset = 0 : i64} : (memref<25927680xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + %28 = "byre.alias"(%alloc) <{offset = 0 : i64}> : (memref<25927680xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg107, %25, %28) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg107, %22, %25) {BlockSize.x = 
128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown42", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%arg107, %22, %25) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown42", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg106, %arg17, %25, %22, %arg169, %arg168) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%22, %arg105, %25) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %29 = "byre.alias"(%alloc) {offset = 1376256 : i64} : (memref<25927680xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + %29 = "byre.alias"(%alloc) <{offset = 1376256 : i64}> : (memref<25927680xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg104, %22, %29) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> - %30 = "byre.alias"(%alloc) {offset = 6389760 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> - byre.compute @PTXOp(%27, %25, %arg104, %30) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown46", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + %30 = "byre.alias"(%alloc) <{offset = 6389760 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%27, %25, %arg104, %30) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown38", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg101, %arg13, %30, %25, %arg162, %arg161) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%25, %arg100, %22) {batch_group_count 
= 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %31 = "byre.alias"(%alloc) {offset = 1081344 : i64} : (memref<25927680xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + %31 = "byre.alias"(%alloc) <{offset = 1081344 : i64}> : (memref<25927680xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg99, %25, %31) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg99, %22, %25) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown50", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %32 = "byre.alias"(%alloc) {offset = 6590464 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%arg99, %22, %25) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown42", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + %32 = "byre.alias"(%alloc) <{offset = 6590464 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg98, %arg11, %25, %32, %arg160, %arg159) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - %33 = "byre.alias"(%alloc) {offset = 10919936 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> + %33 = "byre.alias"(%alloc) <{offset = 10919936 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%32, %arg97, %33) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %34 = "byre.alias"(%alloc) {offset = 671744 : i64} : (memref<25927680xi8, "cuda">) -> memref<128x64x3x3xf16, "cuda"> + %34 = "byre.alias"(%alloc) <{offset = 671744 : i64}> : (memref<25927680xi8, "cuda">) -> memref<128x64x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg96, %32, %34) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : 
tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128x64x3x3xf16, "cuda"> - %35 = "byre.alias"(%alloc) {offset = 11321344 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + %35 = "byre.alias"(%alloc) <{offset = 11321344 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg103, %arg15, %30, %35, %arg167, %arg166) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - %36 = "byre.alias"(%alloc) {offset = 12525568 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> + %36 = "byre.alias"(%alloc) <{offset = 12525568 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%35, %arg102, %36) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x64x1x1xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %37 = "byre.alias"(%alloc) {offset = 294912 : i64} : (memref<25927680xi8, "cuda">) -> memref<128x64x1x1xf16, "cuda"> + %37 = "byre.alias"(%alloc) <{offset = 294912 : i64}> : (memref<25927680xi8, "cuda">) -> memref<128x64x1x1xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg96, %35, %37) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128x64x1x1xf16, "cuda"> - %38 = "byre.alias"(%alloc) {offset = 6389760 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> - byre.compute @PTXOp(%36, %33, %arg96, %38) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown57", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + %38 = "byre.alias"(%alloc) <{offset = 6389760 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> + byre.compute @PTXOp(%36, %33, %arg96, %38) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown57", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg95, %arg9, %38, %36, %arg156, %arg155) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%36, %arg94, 
%33) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %39 = "byre.alias"(%alloc) {offset = 376832 : i64} : (memref<25927680xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + %39 = "byre.alias"(%alloc) <{offset = 598016 : i64}> : (memref<25927680xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg93, %36, %39) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg93, %33, %36) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown61", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + byre.compute @PTXOp(%arg93, %33, %36) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown61", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg92, %arg7, %36, %33, %arg154, %arg153) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%33, %arg91, %36) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %40 = "byre.alias"(%alloc) {offset = 524288 : i64} : (memref<25927680xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + %40 = "byre.alias"(%alloc) <{offset = 524288 : i64}> : (memref<25927680xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg90, %33, %40) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%38, %36, %arg90, %33) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown65", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + byre.compute @PTXOp(%38, %36, 
%arg90, %33) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown57", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg89, %arg5, %33, %36, %arg150, %arg149) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - %41 = "byre.alias"(%alloc) {offset = 11321344 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> + %41 = "byre.alias"(%alloc) <{offset = 11321344 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%36, %arg88, %41) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %42 = "byre.alias"(%alloc) {offset = 598016 : i64} : (memref<25927680xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + %42 = "byre.alias"(%alloc) <{offset = 376832 : i64}> : (memref<25927680xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg87, %36, %42) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg87, %41, %36) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown69", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + byre.compute @PTXOp(%arg87, %41, %36) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown61", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg86, %arg3, %36, %41, %arg148, %arg147) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%41, %arg85, %36) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %43 = "byre.alias"(%alloc) 
{offset = 450560 : i64} : (memref<25927680xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + %43 = "byre.alias"(%alloc) <{offset = 450560 : i64}> : (memref<25927680xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg84, %41, %43) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%33, %36, %38) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown73", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %44 = "byre.alias"(%alloc) {offset = 12525568 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x64x112x112xf16, "cuda"> + byre.compute @PTXOp(%33, %36, %38) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown73", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + %44 = "byre.alias"(%alloc) <{offset = 12525568 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x64x112x112xf16, "cuda"> byre.compute @PoolMaxGradOp_f16f16_f16(%arg83, %38, %44) {device = "cuda", memory_effects = [1 : i32, 1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<1x64x112x112xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x112x112xf16, "cuda"> - %45 = "byre.alias"(%alloc) {offset = 10919936 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x64x112x112xf16, "cuda"> - byre.compute @PTXOp(%arg83, %44, %45) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown74", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x112x112xf16, "cuda">, memref<1x64x112x112xf16, "cuda">, memref<1x64x112x112xf16, "cuda"> + %45 = "byre.alias"(%alloc) <{offset = 10919936 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1x64x112x112xf16, "cuda"> + byre.compute @PTXOp(%arg83, %44, %45) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown74", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x112x112xf16, "cuda">, memref<1x64x112x112xf16, "cuda">, memref<1x64x112x112xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg82, %arg1, %45, %44, %arg143, %arg142) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<1x64x112x112xf16, "cuda">, memref<1x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - %46 = "byre.alias"(%alloc) {offset = 10919936 : i64} : (memref<25927680xi8, "cuda">) -> memref<64x3x7x7xf16, "cuda"> + %46 = "byre.alias"(%alloc) <{offset = 10919936 : i64}> : (memref<25927680xi8, "cuda">) -> memref<64x3x7x7xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg81, %44, %46) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, 
input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x3x224x224xf16, "cuda">, memref<1x64x112x112xf16, "cuda">, memref<64x3x7x7xf16, "cuda"> - byre.compute @PTXOp(%46, %arg144) {BlockSize.x = 128 : i32, GridSize.x = 74 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown77", memory_effects = [1 : i32, 2 : i32]} : memref<64x3x7x7xf16, "cuda">, memref<64x3x7x7xf32, "cuda"> - %47 = "byre.alias"(%alloc) {offset = 12525568 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x1000xf32, "cuda"> - byre.compute @PTXOp(%arg141, %47) {BlockSize.x = 128 : i32, GridSize.x = 8 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown78", memory_effects = [1 : i32, 2 : i32]} : memref<1x1000xf16, "cuda">, memref<1x1000xf32, "cuda"> - %48 = "byre.alias"(%alloc) {offset = 10919936 : i64} : (memref<25927680xi8, "cuda">) -> memref<1000xf32, "cuda"> - byre.compute @ReduceSumOp_f32_f32(%47, %48) {device = "cuda", dimensions = dense<0> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<1x1000xf32, "cuda">, memref<1000xf32, "cuda"> - byre.compute @PTXOp(%48, %arg145) {BlockSize.x = 128 : i32, GridSize.x = 8 : i32, arg_ranks = [1 : i32, 1 : i32], kernel_name = "Unknown79", memory_effects = [1 : i32, 2 : i32]} : memref<1000xf32, "cuda">, memref<1000xf32, "cuda"> - %49 = "byre.alias"(%arg141) {offset = 0 : i64} : (memref<1x1000xf16, "cuda">) -> memref<1000x1xf16, "cuda"> - %50 = "byre.alias"(%alloc) {offset = 12525568 : i64} : (memref<25927680xi8, "cuda">) -> memref<1000x512xf16, "cuda"> - byre.compute @MatmulOp_f16f16_f16(%49, %arg139, %50) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<1000x1xf16, "cuda">, memref<1x512xf16, "cuda">, memref<1000x512xf16, "cuda"> - byre.compute @PTXOp(%50, %arg146) {BlockSize.x = 128 : i32, GridSize.x = 4000 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown80", memory_effects = [1 : i32, 2 : i32]} : memref<1000x512xf16, "cuda">, memref<1000x512xf32, "cuda"> - byre.compute @PTXOp(%43, %arg151) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown81", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%42, %arg152) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown82", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%40, %arg157) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown83", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%39, %arg158) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown84", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%34, %arg163) {BlockSize.x = 128 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown85", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x3x3xf16, "cuda">, memref<128x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%31, %arg164) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = 
"Unknown86", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> - byre.compute @PTXOp(%37, %arg165) {BlockSize.x = 128 : i32, GridSize.x = 64 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown87", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x1x1xf16, "cuda">, memref<128x64x1x1xf32, "cuda"> - byre.compute @PTXOp(%29, %arg172) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown88", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> - byre.compute @PTXOp(%28, %arg173) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown89", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> - byre.compute @PTXOp(%23, %arg178) {BlockSize.x = 128 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown90", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x3x3xf16, "cuda">, memref<256x128x3x3xf32, "cuda"> - byre.compute @PTXOp(%20, %arg179) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown91", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> - byre.compute @PTXOp(%26, %arg180) {BlockSize.x = 128 : i32, GridSize.x = 256 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown92", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x1x1xf16, "cuda">, memref<256x128x1x1xf32, "cuda"> - byre.compute @PTXOp(%18, %arg187) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown93", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> - byre.compute @PTXOp(%17, %arg188) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown94", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> - byre.compute @PTXOp(%12, %arg193) {BlockSize.x = 128 : i32, GridSize.x = 9216 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown95", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x3x3xf16, "cuda">, memref<512x256x3x3xf32, "cuda"> - byre.compute @PTXOp(%9, %arg194) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown96", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> - byre.compute @PTXOp(%14, %arg195) {BlockSize.x = 128 : i32, GridSize.x = 1024 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown97", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x1x1xf16, "cuda">, memref<512x256x1x1xf32, "cuda"> - byre.compute @PTXOp(%6, %arg202) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown98", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> - byre.compute @PTXOp(%4, %arg203) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown99", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> + byre.compute @PTXOp(%46, %arg144) {BlockSize.x = 256 : i32, GridSize.x = 10 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown77", memory_effects = [1 : i32, 2 : i32]} : 
memref<64x3x7x7xf16, "cuda">, memref<64x3x7x7xf32, "cuda"> + byre.compute @PTXOp(%arg141, %arg145) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown78", memory_effects = [1 : i32, 2 : i32]} : memref<1x1000xf16, "cuda">, memref<1000xf32, "cuda"> + %47 = "byre.alias"(%arg141) <{offset = 0 : i64}> : (memref<1x1000xf16, "cuda">) -> memref<1000x1xf16, "cuda"> + %48 = "byre.alias"(%alloc) <{offset = 12525568 : i64}> : (memref<25927680xi8, "cuda">) -> memref<1000x512xf16, "cuda"> + byre.compute @MatmulOp_f16f16_f16(%47, %arg139, %48) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<1000x1xf16, "cuda">, memref<1x512xf16, "cuda">, memref<1000x512xf16, "cuda"> + byre.compute @PTXOp(%48, %arg146) {BlockSize.x = 256 : i32, GridSize.x = 500 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown79", memory_effects = [1 : i32, 2 : i32]} : memref<1000x512xf16, "cuda">, memref<1000x512xf32, "cuda"> + byre.compute @PTXOp(%43, %arg151) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown80", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> + byre.compute @PTXOp(%42, %arg152) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown80", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> + byre.compute @PTXOp(%40, %arg157) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown80", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> + byre.compute @PTXOp(%39, %arg158) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown80", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> + byre.compute @PTXOp(%34, %arg163) {BlockSize.x = 256 : i32, GridSize.x = 72 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown84", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x3x3xf16, "cuda">, memref<128x64x3x3xf32, "cuda"> + byre.compute @PTXOp(%31, %arg164) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown85", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> + byre.compute @PTXOp(%37, %arg165) {BlockSize.x = 256 : i32, GridSize.x = 8 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown86", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x1x1xf16, "cuda">, memref<128x64x1x1xf32, "cuda"> + byre.compute @PTXOp(%29, %arg172) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown85", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> + byre.compute @PTXOp(%28, %arg173) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown85", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> + byre.compute @PTXOp(%23, %arg178) {BlockSize.x = 256 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown89", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x3x3xf16, "cuda">, memref<256x128x3x3xf32, "cuda"> + byre.compute @PTXOp(%21, %arg179) 
{BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown90", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> + byre.compute @PTXOp(%26, %arg180) {BlockSize.x = 256 : i32, GridSize.x = 32 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown91", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x1x1xf16, "cuda">, memref<256x128x1x1xf32, "cuda"> + byre.compute @PTXOp(%20, %arg187) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown90", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> + byre.compute @PTXOp(%18, %arg188) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown90", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> + byre.compute @PTXOp(%12, %arg193) {BlockSize.x = 256 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown94", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x3x3xf16, "cuda">, memref<512x256x3x3xf32, "cuda"> + byre.compute @PTXOp(%9, %arg194) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown95", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> + byre.compute @PTXOp(%15, %arg195) {BlockSize.x = 256 : i32, GridSize.x = 128 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown96", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x1x1xf16, "cuda">, memref<512x256x1x1xf32, "cuda"> + byre.compute @PTXOp(%6, %arg202) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown95", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> + byre.compute @PTXOp(%4, %arg203) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown95", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> return } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/BW/device_output.ptx b/compiler/test/E2E/ResNet18/BW/device_output.ptx index 5b5b9aea4..8c22562a6 100644 --- a/compiler/test/E2E/ResNet18/BW/device_output.ptx +++ b/compiler/test/E2E/ResNet18/BW/device_output.ptx @@ -6,285 +6,8 @@ .target sm_70 .address_size 64 - // .globl Unknown99 - -.visible .entry Unknown99( - .param .u64 Unknown99_param_0, - .param .u64 Unknown99_param_1, - .param .u64 Unknown99_param_2, - .param .u64 Unknown99_param_3, - .param .u64 Unknown99_param_4, - .param .u64 Unknown99_param_5, - .param .u64 Unknown99_param_6, - .param .u64 Unknown99_param_7, - .param .u64 Unknown99_param_8, - .param .u64 Unknown99_param_9, - .param .u64 Unknown99_param_10, - .param .u64 Unknown99_param_11, - .param .u64 Unknown99_param_12, - .param .u64 Unknown99_param_13, - .param .u64 Unknown99_param_14, - .param .u64 Unknown99_param_15, - .param .u64 Unknown99_param_16, - .param .u64 Unknown99_param_17, - .param .u64 Unknown99_param_18, - .param .u64 Unknown99_param_19, - .param .u64 Unknown99_param_20, - .param .u64 Unknown99_param_21 -) -{ - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; - .reg .f32 %f<2>; - .reg .b64 %rd<57>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 
%rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 2359295; - @%p1 bra $L__BB0_2; - ld.param.u64 %rd4, [Unknown99_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown99_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 55; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -512; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 512; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 55; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 9; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 4608; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 1; - add.s64 %rd54, %rd2, %rd53; - ld.global.b16 %h1, [%rd54]; - cvt.f32.f16 %f1, %h1; - shl.b64 %rd55, %rd52, 2; - add.s64 %rd56, %rd1, %rd55; - st.global.f32 [%rd56], %f1; -$L__BB0_2: - ret; - -} - // .globl Unknown98 -.visible .entry Unknown98( - .param .u64 Unknown98_param_0, - .param .u64 Unknown98_param_1, - .param .u64 Unknown98_param_2, - .param .u64 Unknown98_param_3, - .param .u64 Unknown98_param_4, - .param .u64 Unknown98_param_5, - .param .u64 Unknown98_param_6, - .param .u64 Unknown98_param_7, - .param .u64 Unknown98_param_8, - .param .u64 Unknown98_param_9, - .param .u64 Unknown98_param_10, - .param .u64 Unknown98_param_11, - .param .u64 Unknown98_param_12, - .param .u64 Unknown98_param_13, - .param .u64 Unknown98_param_14, - .param .u64 Unknown98_param_15, - .param .u64 Unknown98_param_16, - .param .u64 Unknown98_param_17, - .param .u64 Unknown98_param_18, - .param .u64 Unknown98_param_19, - .param .u64 Unknown98_param_20, - .param .u64 Unknown98_param_21 -) -{ - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; - .reg .f32 %f<2>; - .reg .b64 %rd<57>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 2359295; - @%p1 bra $L__BB1_2; - ld.param.u64 %rd4, [Unknown98_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown98_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - 
mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 55; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -512; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 512; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 55; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 9; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 4608; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 1; - add.s64 %rd54, %rd2, %rd53; - ld.global.b16 %h1, [%rd54]; - cvt.f32.f16 %f1, %h1; - shl.b64 %rd55, %rd52, 2; - add.s64 %rd56, %rd1, %rd55; - st.global.f32 [%rd56], %f1; -$L__BB1_2: - ret; - -} - // .globl Unknown97 -.visible .entry Unknown97( - .param .u64 Unknown97_param_0, - .param .u64 Unknown97_param_1, - .param .u64 Unknown97_param_2, - .param .u64 Unknown97_param_3, - .param .u64 Unknown97_param_4, - .param .u64 Unknown97_param_5, - .param .u64 Unknown97_param_6, - .param .u64 Unknown97_param_7, - .param .u64 Unknown97_param_8, - .param .u64 Unknown97_param_9, - .param .u64 Unknown97_param_10, - .param .u64 Unknown97_param_11, - .param .u64 Unknown97_param_12, - .param .u64 Unknown97_param_13, - .param .u64 Unknown97_param_14, - .param .u64 Unknown97_param_15, - .param .u64 Unknown97_param_16, - .param .u64 Unknown97_param_17, - .param .u64 Unknown97_param_18, - .param .u64 Unknown97_param_19, - .param .u64 Unknown97_param_20, - .param .u64 Unknown97_param_21 -) -{ - .reg .pred %p<3>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; - .reg .f32 %f<2>; - .reg .b64 %rd<27>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 131071; - @%p1 bra $L__BB2_2; - ld.param.u64 %rd4, [Unknown97_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown97_param_1]; - cvta.to.global.u64 %rd2, %rd5; - shr.s64 %rd8, %rd3, 63; - shr.u64 %rd9, %rd8, 56; - add.s64 %rd10, %rd3, %rd9; - and.b64 %rd11, %rd10, -256; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 256; - selp.b64 %rd14, %rd13, %rd12, %p2; - xor.b64 %rd15, %rd8, %rd3; - shr.s64 %rd16, %rd15, 63; - shr.u64 %rd17, %rd16, 56; - add.s64 %rd18, %rd15, %rd17; - shr.u64 %rd19, %rd18, 8; - xor.b64 %rd20, %rd19, %rd8; - shl.b64 %rd21, %rd20, 8; - add.s64 %rd22, %rd21, %rd14; - shl.b64 %rd23, %rd22, 1; - add.s64 %rd24, %rd2, %rd23; - ld.global.b16 %h1, [%rd24]; - cvt.f32.f16 %f1, %h1; - shl.b64 %rd25, %rd22, 2; - add.s64 %rd26, %rd1, %rd25; - st.global.f32 [%rd26], %f1; -$L__BB2_2: - ret; - -} // .globl Unknown96 + .visible .entry Unknown96( .param .u64 Unknown96_param_0, .param .u64 Unknown96_param_1, @@ -310,80 +33,42 @@ $L__BB2_2: .param .u64 Unknown96_param_21 ) { - .reg 
.pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; + .reg .pred %p<3>; + .reg .b16 %rs<2>; + .reg .b32 %r<5>; .reg .f32 %f<2>; - .reg .b64 %rd<57>; + .reg .b64 %rd<24>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 2359295; - @%p1 bra $L__BB3_2; - ld.param.u64 %rd4, [Unknown96_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown96_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 55; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -512; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 512; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 55; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 9; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 4608; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 1; - add.s64 %rd54, %rd2, %rd53; - ld.global.b16 %h1, [%rd54]; - cvt.f32.f16 %f1, %h1; - shl.b64 %rd55, %rd52, 2; - add.s64 %rd56, %rd1, %rd55; - st.global.f32 [%rd56], %f1; -$L__BB3_2: + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 131071; + @%p1 bra $L__BB0_3; + ld.param.u64 %rd15, [Unknown96_param_12]; + cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, [Unknown96_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd19, %rd23, 1; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 1; + shl.b64 %rd20, %rd23, 2; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 2; +$L__BB0_2: + ld.global.nc.u16 %rs1, [%rd22]; + cvt.f32.f16 %f1, %rs1; + st.global.f32 [%rd21], %f1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, %rd23, 131072; + @%p2 bra $L__BB0_2; +$L__BB0_3: ret; } @@ -413,80 +98,42 @@ $L__BB3_2: .param .u64 Unknown95_param_21 ) { - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; + .reg .pred %p<3>; + .reg .b16 %rs<2>; + .reg .b32 %r<5>; .reg .f32 %f<2>; - .reg .b64 %rd<57>; + .reg .b64 %rd<24>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 1179647; - @%p1 bra $L__BB4_2; - ld.param.u64 %rd4, [Unknown95_param_12]; - cvta.to.global.u64 %rd1, 
%rd4; - ld.param.u64 %rd5, [Unknown95_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 56; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -256; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 256; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 56; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 8; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 2304; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 1; - add.s64 %rd54, %rd2, %rd53; - ld.global.b16 %h1, [%rd54]; - cvt.f32.f16 %f1, %h1; - shl.b64 %rd55, %rd52, 2; - add.s64 %rd56, %rd1, %rd55; - st.global.f32 [%rd56], %f1; -$L__BB4_2: + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 2359295; + @%p1 bra $L__BB1_3; + ld.param.u64 %rd15, [Unknown95_param_12]; + cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, [Unknown95_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd19, %rd23, 1; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 1; + shl.b64 %rd20, %rd23, 2; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 2; +$L__BB1_2: + ld.global.nc.u16 %rs1, [%rd22]; + cvt.f32.f16 %f1, %rs1; + st.global.f32 [%rd21], %f1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, %rd23, 2359296; + @%p2 bra $L__BB1_2; +$L__BB1_3: ret; } @@ -515,255 +162,43 @@ $L__BB4_2: .param .u64 Unknown94_param_20, .param .u64 Unknown94_param_21 ) -{ - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; - .reg .f32 %f<2>; - .reg .b64 %rd<57>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 589823; - @%p1 bra $L__BB5_2; - ld.param.u64 %rd4, [Unknown94_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown94_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 
%rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 56; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -256; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 256; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 56; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 8; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 2304; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 1; - add.s64 %rd54, %rd2, %rd53; - ld.global.b16 %h1, [%rd54]; - cvt.f32.f16 %f1, %h1; - shl.b64 %rd55, %rd52, 2; - add.s64 %rd56, %rd1, %rd55; - st.global.f32 [%rd56], %f1; -$L__BB5_2: - ret; - -} - // .globl Unknown93 -.visible .entry Unknown93( - .param .u64 Unknown93_param_0, - .param .u64 Unknown93_param_1, - .param .u64 Unknown93_param_2, - .param .u64 Unknown93_param_3, - .param .u64 Unknown93_param_4, - .param .u64 Unknown93_param_5, - .param .u64 Unknown93_param_6, - .param .u64 Unknown93_param_7, - .param .u64 Unknown93_param_8, - .param .u64 Unknown93_param_9, - .param .u64 Unknown93_param_10, - .param .u64 Unknown93_param_11, - .param .u64 Unknown93_param_12, - .param .u64 Unknown93_param_13, - .param .u64 Unknown93_param_14, - .param .u64 Unknown93_param_15, - .param .u64 Unknown93_param_16, - .param .u64 Unknown93_param_17, - .param .u64 Unknown93_param_18, - .param .u64 Unknown93_param_19, - .param .u64 Unknown93_param_20, - .param .u64 Unknown93_param_21 -) -{ - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; - .reg .f32 %f<2>; - .reg .b64 %rd<57>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 589823; - @%p1 bra $L__BB6_2; - ld.param.u64 %rd4, [Unknown93_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown93_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 56; 
- add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -256; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 256; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 56; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 8; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 2304; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 1; - add.s64 %rd54, %rd2, %rd53; - ld.global.b16 %h1, [%rd54]; - cvt.f32.f16 %f1, %h1; - shl.b64 %rd55, %rd52, 2; - add.s64 %rd56, %rd1, %rd55; - st.global.f32 [%rd56], %f1; -$L__BB6_2: - ret; - -} - // .globl Unknown92 -.visible .entry Unknown92( - .param .u64 Unknown92_param_0, - .param .u64 Unknown92_param_1, - .param .u64 Unknown92_param_2, - .param .u64 Unknown92_param_3, - .param .u64 Unknown92_param_4, - .param .u64 Unknown92_param_5, - .param .u64 Unknown92_param_6, - .param .u64 Unknown92_param_7, - .param .u64 Unknown92_param_8, - .param .u64 Unknown92_param_9, - .param .u64 Unknown92_param_10, - .param .u64 Unknown92_param_11, - .param .u64 Unknown92_param_12, - .param .u64 Unknown92_param_13, - .param .u64 Unknown92_param_14, - .param .u64 Unknown92_param_15, - .param .u64 Unknown92_param_16, - .param .u64 Unknown92_param_17, - .param .u64 Unknown92_param_18, - .param .u64 Unknown92_param_19, - .param .u64 Unknown92_param_20, - .param .u64 Unknown92_param_21 -) { .reg .pred %p<3>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; + .reg .b16 %rs<2>; + .reg .b32 %r<5>; .reg .f32 %f<2>; - .reg .b64 %rd<27>; + .reg .b64 %rd<24>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 32767; - @%p1 bra $L__BB7_2; - ld.param.u64 %rd4, [Unknown92_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown92_param_1]; - cvta.to.global.u64 %rd2, %rd5; - shr.s64 %rd8, %rd3, 63; - shr.u64 %rd9, %rd8, 57; - add.s64 %rd10, %rd3, %rd9; - and.b64 %rd11, %rd10, -128; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 128; - selp.b64 %rd14, %rd13, %rd12, %p2; - xor.b64 %rd15, %rd8, %rd3; - shr.s64 %rd16, %rd15, 63; - shr.u64 %rd17, %rd16, 57; - add.s64 %rd18, %rd15, %rd17; - shr.u64 %rd19, %rd18, 7; - xor.b64 %rd20, %rd19, %rd8; - shl.b64 %rd21, %rd20, 7; - add.s64 %rd22, %rd21, %rd14; - shl.b64 %rd23, %rd22, 1; - add.s64 %rd24, %rd2, %rd23; - ld.global.b16 %h1, [%rd24]; - cvt.f32.f16 %f1, %h1; - shl.b64 %rd25, %rd22, 2; - add.s64 %rd26, %rd1, %rd25; - st.global.f32 [%rd26], %f1; -$L__BB7_2: + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 1179647; + @%p1 bra $L__BB2_3; + ld.param.u64 %rd15, [Unknown94_param_12]; + cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, [Unknown94_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd19, %rd23, 1; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 1; + shl.b64 %rd20, %rd23, 2; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 2; +$L__BB2_2: + ld.global.nc.u16 %rs1, [%rd22]; + cvt.f32.f16 %f1, %rs1; + st.global.f32 [%rd21], %f1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, %rd23, 1179648; + @%p2 bra $L__BB2_2; +$L__BB2_3: ret; } @@ -793,80 
+228,42 @@ $L__BB7_2: .param .u64 Unknown91_param_21 ) { - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; + .reg .pred %p<3>; + .reg .b16 %rs<2>; + .reg .b32 %r<5>; .reg .f32 %f<2>; - .reg .b64 %rd<57>; + .reg .b64 %rd<24>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 589823; - @%p1 bra $L__BB8_2; - ld.param.u64 %rd4, [Unknown91_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown91_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 56; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -256; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 256; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 56; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 8; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 2304; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 1; - add.s64 %rd54, %rd2, %rd53; - ld.global.b16 %h1, [%rd54]; - cvt.f32.f16 %f1, %h1; - shl.b64 %rd55, %rd52, 2; - add.s64 %rd56, %rd1, %rd55; - st.global.f32 [%rd56], %f1; -$L__BB8_2: + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 32767; + @%p1 bra $L__BB3_3; + ld.param.u64 %rd15, [Unknown91_param_12]; + cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, [Unknown91_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd19, %rd23, 1; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 1; + shl.b64 %rd20, %rd23, 2; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 2; +$L__BB3_2: + ld.global.nc.u16 %rs1, [%rd22]; + cvt.f32.f16 %f1, %rs1; + st.global.f32 [%rd21], %f1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, %rd23, 32768; + @%p2 bra $L__BB3_2; +$L__BB3_3: ret; } @@ -896,80 +293,42 @@ $L__BB8_2: .param .u64 Unknown90_param_21 ) { - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; + .reg .pred %p<3>; + .reg .b16 %rs<2>; + .reg .b32 %r<5>; .reg .f32 %f<2>; - .reg .b64 %rd<57>; + .reg .b64 %rd<24>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 294911; - @%p1 bra $L__BB9_2; - 
ld.param.u64 %rd4, [Unknown90_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown90_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 57; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -128; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 128; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 57; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 7; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 1152; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 1; - add.s64 %rd54, %rd2, %rd53; - ld.global.b16 %h1, [%rd54]; - cvt.f32.f16 %f1, %h1; - shl.b64 %rd55, %rd52, 2; - add.s64 %rd56, %rd1, %rd55; - st.global.f32 [%rd56], %f1; -$L__BB9_2: + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 589823; + @%p1 bra $L__BB4_3; + ld.param.u64 %rd15, [Unknown90_param_12]; + cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, [Unknown90_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd19, %rd23, 1; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 1; + shl.b64 %rd20, %rd23, 2; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 2; +$L__BB4_2: + ld.global.nc.u16 %rs1, [%rd22]; + cvt.f32.f16 %f1, %rs1; + st.global.f32 [%rd21], %f1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, %rd23, 589824; + @%p2 bra $L__BB4_2; +$L__BB4_3: ret; } @@ -998,255 +357,43 @@ $L__BB9_2: .param .u64 Unknown89_param_20, .param .u64 Unknown89_param_21 ) -{ - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; - .reg .f32 %f<2>; - .reg .b64 %rd<57>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 147455; - @%p1 bra $L__BB10_2; - ld.param.u64 %rd4, [Unknown89_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown89_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, 
%rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 57; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -128; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 128; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 57; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 7; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 1152; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 1; - add.s64 %rd54, %rd2, %rd53; - ld.global.b16 %h1, [%rd54]; - cvt.f32.f16 %f1, %h1; - shl.b64 %rd55, %rd52, 2; - add.s64 %rd56, %rd1, %rd55; - st.global.f32 [%rd56], %f1; -$L__BB10_2: - ret; - -} - // .globl Unknown88 -.visible .entry Unknown88( - .param .u64 Unknown88_param_0, - .param .u64 Unknown88_param_1, - .param .u64 Unknown88_param_2, - .param .u64 Unknown88_param_3, - .param .u64 Unknown88_param_4, - .param .u64 Unknown88_param_5, - .param .u64 Unknown88_param_6, - .param .u64 Unknown88_param_7, - .param .u64 Unknown88_param_8, - .param .u64 Unknown88_param_9, - .param .u64 Unknown88_param_10, - .param .u64 Unknown88_param_11, - .param .u64 Unknown88_param_12, - .param .u64 Unknown88_param_13, - .param .u64 Unknown88_param_14, - .param .u64 Unknown88_param_15, - .param .u64 Unknown88_param_16, - .param .u64 Unknown88_param_17, - .param .u64 Unknown88_param_18, - .param .u64 Unknown88_param_19, - .param .u64 Unknown88_param_20, - .param .u64 Unknown88_param_21 -) -{ - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; - .reg .f32 %f<2>; - .reg .b64 %rd<57>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 147455; - @%p1 bra $L__BB11_2; - ld.param.u64 %rd4, [Unknown88_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown88_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, 
%rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 57; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -128; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 128; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 57; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 7; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 1152; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 1; - add.s64 %rd54, %rd2, %rd53; - ld.global.b16 %h1, [%rd54]; - cvt.f32.f16 %f1, %h1; - shl.b64 %rd55, %rd52, 2; - add.s64 %rd56, %rd1, %rd55; - st.global.f32 [%rd56], %f1; -$L__BB11_2: - ret; - -} - // .globl Unknown87 -.visible .entry Unknown87( - .param .u64 Unknown87_param_0, - .param .u64 Unknown87_param_1, - .param .u64 Unknown87_param_2, - .param .u64 Unknown87_param_3, - .param .u64 Unknown87_param_4, - .param .u64 Unknown87_param_5, - .param .u64 Unknown87_param_6, - .param .u64 Unknown87_param_7, - .param .u64 Unknown87_param_8, - .param .u64 Unknown87_param_9, - .param .u64 Unknown87_param_10, - .param .u64 Unknown87_param_11, - .param .u64 Unknown87_param_12, - .param .u64 Unknown87_param_13, - .param .u64 Unknown87_param_14, - .param .u64 Unknown87_param_15, - .param .u64 Unknown87_param_16, - .param .u64 Unknown87_param_17, - .param .u64 Unknown87_param_18, - .param .u64 Unknown87_param_19, - .param .u64 Unknown87_param_20, - .param .u64 Unknown87_param_21 -) { .reg .pred %p<3>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; + .reg .b16 %rs<2>; + .reg .b32 %r<5>; .reg .f32 %f<2>; - .reg .b64 %rd<27>; + .reg .b64 %rd<24>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 8191; - @%p1 bra $L__BB12_2; - ld.param.u64 %rd4, [Unknown87_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown87_param_1]; - cvta.to.global.u64 %rd2, %rd5; - shr.s64 %rd8, %rd3, 63; - shr.u64 %rd9, %rd8, 58; - add.s64 %rd10, %rd3, %rd9; - and.b64 %rd11, %rd10, -64; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 64; - selp.b64 %rd14, %rd13, %rd12, %p2; - xor.b64 %rd15, %rd8, %rd3; - shr.s64 %rd16, %rd15, 63; - shr.u64 %rd17, %rd16, 58; - add.s64 %rd18, %rd15, %rd17; - shr.u64 %rd19, %rd18, 6; - xor.b64 %rd20, %rd19, %rd8; - shl.b64 %rd21, %rd20, 6; - add.s64 %rd22, %rd21, %rd14; - shl.b64 %rd23, %rd22, 1; - add.s64 %rd24, %rd2, %rd23; - ld.global.b16 %h1, [%rd24]; - cvt.f32.f16 %f1, %h1; - shl.b64 %rd25, %rd22, 2; - add.s64 %rd26, %rd1, %rd25; - st.global.f32 [%rd26], %f1; -$L__BB12_2: + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 294911; + @%p1 bra $L__BB5_3; + ld.param.u64 %rd15, [Unknown89_param_12]; + cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, [Unknown89_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd19, %rd23, 1; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 1; + shl.b64 %rd20, %rd23, 2; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 2; +$L__BB5_2: + ld.global.nc.u16 %rs1, [%rd22]; + cvt.f32.f16 %f1, %rs1; + st.global.f32 [%rd21], %f1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, 
%rd23, 294912; + @%p2 bra $L__BB5_2; +$L__BB5_3: ret; } @@ -1276,80 +423,42 @@ $L__BB12_2: .param .u64 Unknown86_param_21 ) { - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; + .reg .pred %p<3>; + .reg .b16 %rs<2>; + .reg .b32 %r<5>; .reg .f32 %f<2>; - .reg .b64 %rd<57>; + .reg .b64 %rd<24>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 147455; - @%p1 bra $L__BB13_2; - ld.param.u64 %rd4, [Unknown86_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown86_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 57; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -128; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 128; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 57; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 7; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 1152; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 1; - add.s64 %rd54, %rd2, %rd53; - ld.global.b16 %h1, [%rd54]; - cvt.f32.f16 %f1, %h1; - shl.b64 %rd55, %rd52, 2; - add.s64 %rd56, %rd1, %rd55; - st.global.f32 [%rd56], %f1; -$L__BB13_2: + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 8191; + @%p1 bra $L__BB6_3; + ld.param.u64 %rd15, [Unknown86_param_12]; + cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, [Unknown86_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd19, %rd23, 1; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 1; + shl.b64 %rd20, %rd23, 2; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 2; +$L__BB6_2: + ld.global.nc.u16 %rs1, [%rd22]; + cvt.f32.f16 %f1, %rs1; + st.global.f32 [%rd21], %f1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, %rd23, 8192; + @%p2 bra $L__BB6_2; +$L__BB6_3: ret; } @@ -1379,80 +488,42 @@ $L__BB13_2: .param .u64 Unknown85_param_21 ) { - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; + .reg .pred %p<3>; + .reg .b16 %rs<2>; + .reg .b32 %r<5>; .reg .f32 %f<2>; - .reg .b64 %rd<57>; + .reg .b64 %rd<24>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, 
%rd6; - setp.gt.s64 %p1, %rd3, 73727; - @%p1 bra $L__BB14_2; - ld.param.u64 %rd4, [Unknown85_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown85_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 58; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -64; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 64; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 58; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 6; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 576; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 1; - add.s64 %rd54, %rd2, %rd53; - ld.global.b16 %h1, [%rd54]; - cvt.f32.f16 %f1, %h1; - shl.b64 %rd55, %rd52, 2; - add.s64 %rd56, %rd1, %rd55; - st.global.f32 [%rd56], %f1; -$L__BB14_2: + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 147455; + @%p1 bra $L__BB7_3; + ld.param.u64 %rd15, [Unknown85_param_12]; + cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, [Unknown85_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd19, %rd23, 1; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 1; + shl.b64 %rd20, %rd23, 2; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 2; +$L__BB7_2: + ld.global.nc.u16 %rs1, [%rd22]; + cvt.f32.f16 %f1, %rs1; + st.global.f32 [%rd21], %f1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, %rd23, 147456; + @%p2 bra $L__BB7_2; +$L__BB7_3: ret; } @@ -1482,389 +553,42 @@ $L__BB14_2: .param .u64 Unknown84_param_21 ) { - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; - .reg .f32 %f<2>; - .reg .b64 %rd<57>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 36863; - @%p1 bra $L__BB15_2; - ld.param.u64 %rd4, [Unknown84_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown84_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, 
%rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 58; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -64; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 64; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 58; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 6; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 576; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 1; - add.s64 %rd54, %rd2, %rd53; - ld.global.b16 %h1, [%rd54]; - cvt.f32.f16 %f1, %h1; - shl.b64 %rd55, %rd52, 2; - add.s64 %rd56, %rd1, %rd55; - st.global.f32 [%rd56], %f1; -$L__BB15_2: - ret; - -} - // .globl Unknown83 -.visible .entry Unknown83( - .param .u64 Unknown83_param_0, - .param .u64 Unknown83_param_1, - .param .u64 Unknown83_param_2, - .param .u64 Unknown83_param_3, - .param .u64 Unknown83_param_4, - .param .u64 Unknown83_param_5, - .param .u64 Unknown83_param_6, - .param .u64 Unknown83_param_7, - .param .u64 Unknown83_param_8, - .param .u64 Unknown83_param_9, - .param .u64 Unknown83_param_10, - .param .u64 Unknown83_param_11, - .param .u64 Unknown83_param_12, - .param .u64 Unknown83_param_13, - .param .u64 Unknown83_param_14, - .param .u64 Unknown83_param_15, - .param .u64 Unknown83_param_16, - .param .u64 Unknown83_param_17, - .param .u64 Unknown83_param_18, - .param .u64 Unknown83_param_19, - .param .u64 Unknown83_param_20, - .param .u64 Unknown83_param_21 -) -{ - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; - .reg .f32 %f<2>; - .reg .b64 %rd<57>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 36863; - @%p1 bra $L__BB16_2; - ld.param.u64 %rd4, [Unknown83_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown83_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, 
%rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 58; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -64; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 64; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 58; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 6; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 576; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 1; - add.s64 %rd54, %rd2, %rd53; - ld.global.b16 %h1, [%rd54]; - cvt.f32.f16 %f1, %h1; - shl.b64 %rd55, %rd52, 2; - add.s64 %rd56, %rd1, %rd55; - st.global.f32 [%rd56], %f1; -$L__BB16_2: - ret; - -} - // .globl Unknown82 -.visible .entry Unknown82( - .param .u64 Unknown82_param_0, - .param .u64 Unknown82_param_1, - .param .u64 Unknown82_param_2, - .param .u64 Unknown82_param_3, - .param .u64 Unknown82_param_4, - .param .u64 Unknown82_param_5, - .param .u64 Unknown82_param_6, - .param .u64 Unknown82_param_7, - .param .u64 Unknown82_param_8, - .param .u64 Unknown82_param_9, - .param .u64 Unknown82_param_10, - .param .u64 Unknown82_param_11, - .param .u64 Unknown82_param_12, - .param .u64 Unknown82_param_13, - .param .u64 Unknown82_param_14, - .param .u64 Unknown82_param_15, - .param .u64 Unknown82_param_16, - .param .u64 Unknown82_param_17, - .param .u64 Unknown82_param_18, - .param .u64 Unknown82_param_19, - .param .u64 Unknown82_param_20, - .param .u64 Unknown82_param_21 -) -{ - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; - .reg .f32 %f<2>; - .reg .b64 %rd<57>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 36863; - @%p1 bra $L__BB17_2; - ld.param.u64 %rd4, [Unknown82_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown82_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 58; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -64; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 64; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 58; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 6; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 576; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 
%rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 1; - add.s64 %rd54, %rd2, %rd53; - ld.global.b16 %h1, [%rd54]; - cvt.f32.f16 %f1, %h1; - shl.b64 %rd55, %rd52, 2; - add.s64 %rd56, %rd1, %rd55; - st.global.f32 [%rd56], %f1; -$L__BB17_2: - ret; - -} - // .globl Unknown81 -.visible .entry Unknown81( - .param .u64 Unknown81_param_0, - .param .u64 Unknown81_param_1, - .param .u64 Unknown81_param_2, - .param .u64 Unknown81_param_3, - .param .u64 Unknown81_param_4, - .param .u64 Unknown81_param_5, - .param .u64 Unknown81_param_6, - .param .u64 Unknown81_param_7, - .param .u64 Unknown81_param_8, - .param .u64 Unknown81_param_9, - .param .u64 Unknown81_param_10, - .param .u64 Unknown81_param_11, - .param .u64 Unknown81_param_12, - .param .u64 Unknown81_param_13, - .param .u64 Unknown81_param_14, - .param .u64 Unknown81_param_15, - .param .u64 Unknown81_param_16, - .param .u64 Unknown81_param_17, - .param .u64 Unknown81_param_18, - .param .u64 Unknown81_param_19, - .param .u64 Unknown81_param_20, - .param .u64 Unknown81_param_21 -) -{ - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; + .reg .pred %p<3>; + .reg .b16 %rs<2>; + .reg .b32 %r<5>; .reg .f32 %f<2>; - .reg .b64 %rd<57>; + .reg .b64 %rd<24>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 36863; - @%p1 bra $L__BB18_2; - ld.param.u64 %rd4, [Unknown81_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown81_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 58; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -64; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 64; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 58; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 6; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 576; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 1; - add.s64 %rd54, %rd2, %rd53; - ld.global.b16 %h1, [%rd54]; - cvt.f32.f16 %f1, %h1; - shl.b64 %rd55, %rd52, 2; - add.s64 %rd56, %rd1, %rd55; - st.global.f32 [%rd56], %f1; -$L__BB18_2: + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 73727; + @%p1 bra $L__BB8_3; + ld.param.u64 %rd15, [Unknown84_param_12]; + 
cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, [Unknown84_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd19, %rd23, 1; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 1; + shl.b64 %rd20, %rd23, 2; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 2; +$L__BB8_2: + ld.global.nc.u16 %rs1, [%rd22]; + cvt.f32.f16 %f1, %rs1; + st.global.f32 [%rd21], %f1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, %rd23, 73728; + @%p2 bra $L__BB8_2; +$L__BB8_3: ret; } @@ -1883,51 +607,53 @@ $L__BB18_2: .param .u64 Unknown80_param_10, .param .u64 Unknown80_param_11, .param .u64 Unknown80_param_12, - .param .u64 Unknown80_param_13 + .param .u64 Unknown80_param_13, + .param .u64 Unknown80_param_14, + .param .u64 Unknown80_param_15, + .param .u64 Unknown80_param_16, + .param .u64 Unknown80_param_17, + .param .u64 Unknown80_param_18, + .param .u64 Unknown80_param_19, + .param .u64 Unknown80_param_20, + .param .u64 Unknown80_param_21 ) { .reg .pred %p<3>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; + .reg .b16 %rs<2>; + .reg .b32 %r<5>; .reg .f32 %f<2>; - .reg .b64 %rd<27>; + .reg .b64 %rd<24>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 511999; - @%p1 bra $L__BB19_2; - ld.param.u64 %rd4, [Unknown80_param_8]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown80_param_1]; - cvta.to.global.u64 %rd2, %rd5; - shr.s64 %rd8, %rd3, 63; - shr.u64 %rd9, %rd8, 55; - add.s64 %rd10, %rd3, %rd9; - and.b64 %rd11, %rd10, -512; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 512; - selp.b64 %rd14, %rd13, %rd12, %p2; - xor.b64 %rd15, %rd8, %rd3; - shr.s64 %rd16, %rd15, 63; - shr.u64 %rd17, %rd16, 55; - add.s64 %rd18, %rd15, %rd17; - shr.u64 %rd19, %rd18, 9; - xor.b64 %rd20, %rd19, %rd8; - shl.b64 %rd21, %rd20, 9; - add.s64 %rd22, %rd21, %rd14; - shl.b64 %rd23, %rd22, 1; - add.s64 %rd24, %rd2, %rd23; - ld.global.b16 %h1, [%rd24]; - cvt.f32.f16 %f1, %h1; - shl.b64 %rd25, %rd22, 2; - add.s64 %rd26, %rd1, %rd25; - st.global.f32 [%rd26], %f1; -$L__BB19_2: + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 36863; + @%p1 bra $L__BB9_3; + ld.param.u64 %rd15, [Unknown80_param_12]; + cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, [Unknown80_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd19, %rd23, 1; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 1; + shl.b64 %rd20, %rd23, 2; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 2; +$L__BB9_2: + ld.global.nc.u16 %rs1, [%rd22]; + cvt.f32.f16 %f1, %rs1; + st.global.f32 [%rd21], %f1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, %rd23, 36864; + @%p2 bra $L__BB9_2; +$L__BB9_3: ret; } @@ -1942,35 +668,49 @@ $L__BB19_2: .param .u64 Unknown79_param_6, .param .u64 Unknown79_param_7, .param .u64 Unknown79_param_8, - .param .u64 Unknown79_param_9 + .param .u64 Unknown79_param_9, + .param .u64 Unknown79_param_10, + .param .u64 Unknown79_param_11, + .param .u64 Unknown79_param_12, + .param .u64 Unknown79_param_13 ) { - .reg .pred %p<2>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; - .reg .f32 %f<3>; - .reg .b64 %rd<11>; + .reg .pred %p<3>; + .reg .b16 %rs<2>; + .reg .b32 %r<5>; + .reg .f32 %f<2>; + 
.reg .b64 %rd<24>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd7, %r3; - mul.wide.s32 %rd8, %r2, %r1; - add.s64 %rd9, %rd8, %rd7; - setp.gt.s64 %p1, %rd9, 999; - @%p1 bra $L__BB20_2; - ld.param.u64 %rd3, [Unknown79_param_6]; - cvta.to.global.u64 %rd4, %rd3; - ld.param.u64 %rd5, [Unknown79_param_1]; - cvta.to.global.u64 %rd6, %rd5; - shl.b64 %rd10, %rd9, 2; - add.s64 %rd1, %rd6, %rd10; - add.s64 %rd2, %rd4, %rd10; - ld.global.f32 %f1, [%rd1]; - cvt.rn.f16.f32 %h1, %f1; - cvt.f32.f16 %f2, %h1; - st.global.f32 [%rd2], %f2; -$L__BB20_2: + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 511999; + @%p1 bra $L__BB10_3; + ld.param.u64 %rd15, [Unknown79_param_8]; + cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, [Unknown79_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd19, %rd23, 1; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 1; + shl.b64 %rd20, %rd23, 2; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 2; +$L__BB10_2: + ld.global.nc.u16 %rs1, [%rd22]; + cvt.f32.f16 %f1, %rs1; + st.global.f32 [%rd21], %f1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, %rd23, 512000; + @%p2 bra $L__BB10_2; +$L__BB10_3: ret; } @@ -1992,32 +732,42 @@ $L__BB20_2: .param .u64 Unknown78_param_13 ) { - .reg .pred %p<2>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; + .reg .pred %p<3>; + .reg .b16 %rs<2>; + .reg .b32 %r<5>; .reg .f32 %f<2>; - .reg .b64 %rd<12>; + .reg .b64 %rd<24>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd7, %r3; - mul.wide.s32 %rd8, %r2, %r1; - add.s64 %rd9, %rd8, %rd7; - setp.gt.s64 %p1, %rd9, 999; - @%p1 bra $L__BB21_2; - ld.param.u64 %rd3, [Unknown78_param_8]; - cvta.to.global.u64 %rd4, %rd3; - ld.param.u64 %rd5, [Unknown78_param_1]; - cvta.to.global.u64 %rd6, %rd5; - shl.b64 %rd10, %rd9, 1; - add.s64 %rd1, %rd6, %rd10; - shl.b64 %rd11, %rd9, 2; - add.s64 %rd2, %rd4, %rd11; - ld.global.b16 %h1, [%rd1]; - cvt.f32.f16 %f1, %h1; - st.global.f32 [%rd2], %f1; -$L__BB21_2: + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 999; + @%p1 bra $L__BB11_3; + ld.param.u64 %rd15, [Unknown78_param_8]; + cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, [Unknown78_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd19, %rd23, 1; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 1; + shl.b64 %rd20, %rd23, 2; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 2; +$L__BB11_2: + ld.global.nc.u16 %rs1, [%rd22]; + cvt.f32.f16 %f1, %rs1; + st.global.f32 [%rd21], %f1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, %rd23, 1000; + @%p2 bra $L__BB11_2; +$L__BB11_3: ret; } @@ -2047,84 +797,42 @@ $L__BB21_2: .param .u64 Unknown77_param_21 ) { - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; + .reg .pred %p<3>; + .reg .b16 %rs<2>; + .reg .b32 %r<5>; .reg .f32 %f<2>; - .reg .b64 %rd<61>; + .reg .b64 %rd<24>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 9407; - @%p1 bra $L__BB22_2; - ld.param.u64 %rd4, [Unknown77_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown77_param_1]; - cvta.to.global.u64 %rd2, 
%rd5; - mul.hi.s64 %rd8, %rd3, 5270498306774157605; - shr.u64 %rd9, %rd8, 63; - shr.s64 %rd10, %rd8, 1; - add.s64 %rd11, %rd10, %rd9; - mul.lo.s64 %rd12, %rd11, 7; - sub.s64 %rd13, %rd3, %rd12; - setp.lt.s64 %p2, %rd13, 0; - add.s64 %rd14, %rd13, 7; - selp.b64 %rd15, %rd14, %rd13, %p2; - shr.s64 %rd16, %rd3, 63; - xor.b64 %rd17, %rd16, %rd3; - mul.hi.s64 %rd18, %rd17, 5270498306774157605; - shr.u64 %rd19, %rd18, 63; - shr.s64 %rd20, %rd18, 1; - add.s64 %rd21, %rd20, %rd19; - xor.b64 %rd22, %rd21, %rd16; - mul.hi.s64 %rd23, %rd22, 5270498306774157605; - shr.u64 %rd24, %rd23, 63; - shr.s64 %rd25, %rd23, 1; - add.s64 %rd26, %rd25, %rd24; - mul.lo.s64 %rd27, %rd26, 7; - sub.s64 %rd28, %rd22, %rd27; - setp.lt.s64 %p3, %rd28, 0; - add.s64 %rd29, %rd28, 7; - selp.b64 %rd30, %rd29, %rd28, %p3; - shr.s64 %rd31, %rd22, 63; - xor.b64 %rd32, %rd31, %rd22; - mul.hi.s64 %rd33, %rd32, 5270498306774157605; - shr.u64 %rd34, %rd33, 63; - shr.s64 %rd35, %rd33, 1; - add.s64 %rd36, %rd35, %rd34; - xor.b64 %rd37, %rd36, %rd31; - mul.hi.s64 %rd38, %rd37, 6148914691236517206; - shr.u64 %rd39, %rd38, 63; - add.s64 %rd40, %rd38, %rd39; - mul.lo.s64 %rd41, %rd40, 3; - sub.s64 %rd42, %rd37, %rd41; - setp.lt.s64 %p4, %rd42, 0; - add.s64 %rd43, %rd42, 3; - selp.b64 %rd44, %rd43, %rd42, %p4; - shr.s64 %rd45, %rd37, 63; - xor.b64 %rd46, %rd45, %rd37; - mul.hi.s64 %rd47, %rd46, 6148914691236517206; - shr.u64 %rd48, %rd47, 63; - add.s64 %rd49, %rd47, %rd48; - xor.b64 %rd50, %rd49, %rd45; - mul.lo.s64 %rd51, %rd50, 147; - mul.lo.s64 %rd52, %rd44, 49; - mul.lo.s64 %rd53, %rd30, 7; - add.s64 %rd54, %rd53, %rd15; - add.s64 %rd55, %rd54, %rd52; - add.s64 %rd56, %rd55, %rd51; - shl.b64 %rd57, %rd56, 1; - add.s64 %rd58, %rd2, %rd57; - ld.global.b16 %h1, [%rd58]; - cvt.f32.f16 %f1, %h1; - shl.b64 %rd59, %rd56, 2; - add.s64 %rd60, %rd1, %rd59; - st.global.f32 [%rd60], %f1; -$L__BB22_2: + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 9407; + @%p1 bra $L__BB12_3; + ld.param.u64 %rd15, [Unknown77_param_12]; + cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, [Unknown77_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd19, %rd23, 1; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 1; + shl.b64 %rd20, %rd23, 2; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 2; +$L__BB12_2: + ld.global.nc.u16 %rs1, [%rd22]; + cvt.f32.f16 %f1, %rs1; + st.global.f32 [%rd21], %f1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, %rd23, 9408; + @%p2 bra $L__BB12_2; +$L__BB12_3: ret; } @@ -2165,72 +873,44 @@ $L__BB22_2: .param .u64 Unknown74_param_32 ) { - .reg .pred %p<5>; - .reg .b16 %h<5>; - .reg .b32 %r<4>; - .reg .b64 %rd<48>; + .reg .pred %p<4>; + .reg .b16 %rs<5>; + .reg .b32 %r<5>; + .reg .b64 %rd<22>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 802815; - @%p1 bra $L__BB23_2; - ld.param.u64 %rd5, [Unknown74_param_23]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown74_param_1]; - ld.param.u64 %rd7, [Unknown74_param_12]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - mul.hi.s64 %rd10, %rd4, 5270498306774157605; - shr.u64 %rd11, %rd10, 63; - shr.s64 %rd12, %rd10, 5; - add.s64 %rd13, %rd12, %rd11; - mul.lo.s64 %rd14, %rd13, 112; - sub.s64 %rd15, %rd4, %rd14; - setp.lt.s64 %p2, %rd15, 0; - 
add.s64 %rd16, %rd15, 112; - selp.b64 %rd17, %rd16, %rd15, %p2; - shr.s64 %rd18, %rd4, 63; - xor.b64 %rd19, %rd18, %rd4; - mul.hi.s64 %rd20, %rd19, 5270498306774157605; - shr.u64 %rd21, %rd20, 63; - shr.s64 %rd22, %rd20, 5; - add.s64 %rd23, %rd22, %rd21; - xor.b64 %rd24, %rd23, %rd18; - mul.hi.s64 %rd25, %rd24, 5270498306774157605; - shr.u64 %rd26, %rd25, 63; - shr.s64 %rd27, %rd25, 5; - add.s64 %rd28, %rd27, %rd26; - mul.lo.s64 %rd29, %rd28, 112; - sub.s64 %rd30, %rd24, %rd29; - setp.lt.s64 %p3, %rd30, 0; - add.s64 %rd31, %rd30, 112; - selp.b64 %rd32, %rd31, %rd30, %p3; - shr.s64 %rd33, %rd24, 63; - xor.b64 %rd34, %rd33, %rd24; - mul.hi.s64 %rd35, %rd34, 5270498306774157605; - shr.u64 %rd36, %rd35, 63; - shr.s64 %rd37, %rd35, 5; - add.s64 %rd38, %rd37, %rd36; - xor.b64 %rd39, %rd38, %rd33; - mul.lo.s64 %rd40, %rd39, 12544; - mul.lo.s64 %rd41, %rd32, 112; - add.s64 %rd42, %rd41, %rd17; - add.s64 %rd43, %rd42, %rd40; - shl.b64 %rd44, %rd43, 1; - add.s64 %rd45, %rd3, %rd44; - ld.global.b16 %h1, [%rd45]; - add.s64 %rd46, %rd2, %rd44; - ld.global.b16 %h2, [%rd46]; - mov.b16 %h3, 0x0000; - setp.gt.f16 %p4, %h1, %h3; - selp.b16 %h4, %h2, 0x0000, %p4; - add.s64 %rd47, %rd1, %rd44; - st.global.b16 [%rd47], %h4; -$L__BB23_2: + cvt.s64.s32 %rd15, %r3; + mul.wide.s32 %rd16, %r2, %r1; + add.s64 %rd21, %rd16, %rd15; + setp.gt.s64 %p1, %rd21, 802815; + @%p1 bra $L__BB13_3; + ld.param.u64 %rd12, [Unknown74_param_23]; + cvta.to.global.u64 %rd1, %rd12; + ld.param.u64 %rd13, [Unknown74_param_1]; + ld.param.u64 %rd14, [Unknown74_param_12]; + cvta.to.global.u64 %rd2, %rd14; + cvta.to.global.u64 %rd3, %rd13; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd5, %r2, %r4; + shl.b64 %rd20, %rd21, 1; + shl.b64 %rd7, %rd5, 1; +$L__BB13_2: + add.s64 %rd17, %rd3, %rd20; + ld.global.nc.u16 %rs1, [%rd17]; + add.s64 %rd18, %rd2, %rd20; + ld.global.nc.u16 %rs2, [%rd18]; + mov.b16 %rs3, 0x0000; + setp.gt.f16 %p2, %rs1, %rs3; + selp.b16 %rs4, %rs2, 0x0000, %p2; + add.s64 %rd19, %rd1, %rd20; + st.global.b16 [%rd19], %rs4; + add.s64 %rd21, %rd21, %rd5; + add.s64 %rd20, %rd20, %rd7; + setp.lt.s64 %p3, %rd21, 802816; + @%p3 bra $L__BB13_2; +$L__BB13_3: ret; } @@ -2271,298 +951,42 @@ $L__BB23_2: .param .u64 Unknown73_param_32 ) { - .reg .pred %p<4>; - .reg .b16 %h<4>; - .reg .b32 %r<4>; - .reg .b64 %rd<48>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 200703; - @%p1 bra $L__BB24_2; - ld.param.u64 %rd5, [Unknown73_param_23]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown73_param_1]; - ld.param.u64 %rd7, [Unknown73_param_12]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - mul.hi.s64 %rd10, %rd4, 5270498306774157605; - shr.u64 %rd11, %rd10, 63; - shr.s64 %rd12, %rd10, 4; - add.s64 %rd13, %rd12, %rd11; - mul.lo.s64 %rd14, %rd13, 56; - sub.s64 %rd15, %rd4, %rd14; - setp.lt.s64 %p2, %rd15, 0; - add.s64 %rd16, %rd15, 56; - selp.b64 %rd17, %rd16, %rd15, %p2; - shr.s64 %rd18, %rd4, 63; - xor.b64 %rd19, %rd18, %rd4; - mul.hi.s64 %rd20, %rd19, 5270498306774157605; - shr.u64 %rd21, %rd20, 63; - shr.s64 %rd22, %rd20, 4; - add.s64 %rd23, %rd22, %rd21; - xor.b64 %rd24, %rd23, %rd18; - mul.hi.s64 %rd25, %rd24, 5270498306774157605; - shr.u64 %rd26, %rd25, 63; - shr.s64 %rd27, %rd25, 4; - add.s64 %rd28, %rd27, %rd26; - mul.lo.s64 %rd29, %rd28, 56; - sub.s64 %rd30, %rd24, %rd29; - setp.lt.s64 %p3, %rd30, 0; - add.s64 %rd31, %rd30, 56; - selp.b64 %rd32, %rd31, %rd30, %p3; - shr.s64 
%rd33, %rd24, 63; - xor.b64 %rd34, %rd33, %rd24; - mul.hi.s64 %rd35, %rd34, 5270498306774157605; - shr.u64 %rd36, %rd35, 63; - shr.s64 %rd37, %rd35, 4; - add.s64 %rd38, %rd37, %rd36; - xor.b64 %rd39, %rd38, %rd33; - mul.lo.s64 %rd40, %rd39, 3136; - mul.lo.s64 %rd41, %rd32, 56; - add.s64 %rd42, %rd41, %rd17; - add.s64 %rd43, %rd42, %rd40; - shl.b64 %rd44, %rd43, 1; - add.s64 %rd45, %rd3, %rd44; - ld.global.b16 %h1, [%rd45]; - add.s64 %rd46, %rd2, %rd44; - ld.global.b16 %h2, [%rd46]; - add.rn.f16 %h3, %h1, %h2; - add.s64 %rd47, %rd1, %rd44; - st.global.b16 [%rd47], %h3; -$L__BB24_2: - ret; - -} - // .globl Unknown69 -.visible .entry Unknown69( - .param .u64 Unknown69_param_0, - .param .u64 Unknown69_param_1, - .param .u64 Unknown69_param_2, - .param .u64 Unknown69_param_3, - .param .u64 Unknown69_param_4, - .param .u64 Unknown69_param_5, - .param .u64 Unknown69_param_6, - .param .u64 Unknown69_param_7, - .param .u64 Unknown69_param_8, - .param .u64 Unknown69_param_9, - .param .u64 Unknown69_param_10, - .param .u64 Unknown69_param_11, - .param .u64 Unknown69_param_12, - .param .u64 Unknown69_param_13, - .param .u64 Unknown69_param_14, - .param .u64 Unknown69_param_15, - .param .u64 Unknown69_param_16, - .param .u64 Unknown69_param_17, - .param .u64 Unknown69_param_18, - .param .u64 Unknown69_param_19, - .param .u64 Unknown69_param_20, - .param .u64 Unknown69_param_21, - .param .u64 Unknown69_param_22, - .param .u64 Unknown69_param_23, - .param .u64 Unknown69_param_24, - .param .u64 Unknown69_param_25, - .param .u64 Unknown69_param_26, - .param .u64 Unknown69_param_27, - .param .u64 Unknown69_param_28, - .param .u64 Unknown69_param_29, - .param .u64 Unknown69_param_30, - .param .u64 Unknown69_param_31, - .param .u64 Unknown69_param_32 -) -{ - .reg .pred %p<5>; - .reg .b16 %h<5>; - .reg .b32 %r<4>; - .reg .b64 %rd<48>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 200703; - @%p1 bra $L__BB25_2; - ld.param.u64 %rd5, [Unknown69_param_23]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown69_param_1]; - ld.param.u64 %rd7, [Unknown69_param_12]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - mul.hi.s64 %rd10, %rd4, 5270498306774157605; - shr.u64 %rd11, %rd10, 63; - shr.s64 %rd12, %rd10, 4; - add.s64 %rd13, %rd12, %rd11; - mul.lo.s64 %rd14, %rd13, 56; - sub.s64 %rd15, %rd4, %rd14; - setp.lt.s64 %p2, %rd15, 0; - add.s64 %rd16, %rd15, 56; - selp.b64 %rd17, %rd16, %rd15, %p2; - shr.s64 %rd18, %rd4, 63; - xor.b64 %rd19, %rd18, %rd4; - mul.hi.s64 %rd20, %rd19, 5270498306774157605; - shr.u64 %rd21, %rd20, 63; - shr.s64 %rd22, %rd20, 4; - add.s64 %rd23, %rd22, %rd21; - xor.b64 %rd24, %rd23, %rd18; - mul.hi.s64 %rd25, %rd24, 5270498306774157605; - shr.u64 %rd26, %rd25, 63; - shr.s64 %rd27, %rd25, 4; - add.s64 %rd28, %rd27, %rd26; - mul.lo.s64 %rd29, %rd28, 56; - sub.s64 %rd30, %rd24, %rd29; - setp.lt.s64 %p3, %rd30, 0; - add.s64 %rd31, %rd30, 56; - selp.b64 %rd32, %rd31, %rd30, %p3; - shr.s64 %rd33, %rd24, 63; - xor.b64 %rd34, %rd33, %rd24; - mul.hi.s64 %rd35, %rd34, 5270498306774157605; - shr.u64 %rd36, %rd35, 63; - shr.s64 %rd37, %rd35, 4; - add.s64 %rd38, %rd37, %rd36; - xor.b64 %rd39, %rd38, %rd33; - mul.lo.s64 %rd40, %rd39, 3136; - mul.lo.s64 %rd41, %rd32, 56; - add.s64 %rd42, %rd41, %rd17; - add.s64 %rd43, %rd42, %rd40; - shl.b64 %rd44, %rd43, 1; - add.s64 %rd45, %rd3, %rd44; - ld.global.b16 %h1, [%rd45]; - add.s64 %rd46, %rd2, %rd44; 
- ld.global.b16 %h2, [%rd46]; - mov.b16 %h3, 0x0000; - setp.gt.f16 %p4, %h1, %h3; - selp.b16 %h4, %h2, 0x0000, %p4; - add.s64 %rd47, %rd1, %rd44; - st.global.b16 [%rd47], %h4; -$L__BB25_2: - ret; - -} - // .globl Unknown65 -.visible .entry Unknown65( - .param .u64 Unknown65_param_0, - .param .u64 Unknown65_param_1, - .param .u64 Unknown65_param_2, - .param .u64 Unknown65_param_3, - .param .u64 Unknown65_param_4, - .param .u64 Unknown65_param_5, - .param .u64 Unknown65_param_6, - .param .u64 Unknown65_param_7, - .param .u64 Unknown65_param_8, - .param .u64 Unknown65_param_9, - .param .u64 Unknown65_param_10, - .param .u64 Unknown65_param_11, - .param .u64 Unknown65_param_12, - .param .u64 Unknown65_param_13, - .param .u64 Unknown65_param_14, - .param .u64 Unknown65_param_15, - .param .u64 Unknown65_param_16, - .param .u64 Unknown65_param_17, - .param .u64 Unknown65_param_18, - .param .u64 Unknown65_param_19, - .param .u64 Unknown65_param_20, - .param .u64 Unknown65_param_21, - .param .u64 Unknown65_param_22, - .param .u64 Unknown65_param_23, - .param .u64 Unknown65_param_24, - .param .u64 Unknown65_param_25, - .param .u64 Unknown65_param_26, - .param .u64 Unknown65_param_27, - .param .u64 Unknown65_param_28, - .param .u64 Unknown65_param_29, - .param .u64 Unknown65_param_30, - .param .u64 Unknown65_param_31, - .param .u64 Unknown65_param_32, - .param .u64 Unknown65_param_33, - .param .u64 Unknown65_param_34, - .param .u64 Unknown65_param_35, - .param .u64 Unknown65_param_36, - .param .u64 Unknown65_param_37, - .param .u64 Unknown65_param_38, - .param .u64 Unknown65_param_39, - .param .u64 Unknown65_param_40, - .param .u64 Unknown65_param_41, - .param .u64 Unknown65_param_42, - .param .u64 Unknown65_param_43 -) -{ - .reg .pred %p<5>; - .reg .b16 %h<7>; - .reg .b32 %r<4>; - .reg .b64 %rd<51>; + .reg .pred %p<3>; + .reg .b16 %rs<4>; + .reg .b32 %r<5>; + .reg .b64 %rd<22>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd10, %r3; - mul.wide.s32 %rd11, %r2, %r1; - add.s64 %rd5, %rd11, %rd10; - setp.gt.s64 %p1, %rd5, 200703; - @%p1 bra $L__BB26_2; - ld.param.u64 %rd6, [Unknown65_param_34]; - cvta.to.global.u64 %rd1, %rd6; - ld.param.u64 %rd7, [Unknown65_param_1]; - ld.param.u64 %rd8, [Unknown65_param_23]; - cvta.to.global.u64 %rd2, %rd8; - ld.param.u64 %rd9, [Unknown65_param_12]; - cvta.to.global.u64 %rd3, %rd9; - cvta.to.global.u64 %rd4, %rd7; - mul.hi.s64 %rd12, %rd5, 5270498306774157605; - shr.u64 %rd13, %rd12, 63; - shr.s64 %rd14, %rd12, 4; - add.s64 %rd15, %rd14, %rd13; - mul.lo.s64 %rd16, %rd15, 56; - sub.s64 %rd17, %rd5, %rd16; - setp.lt.s64 %p2, %rd17, 0; - add.s64 %rd18, %rd17, 56; - selp.b64 %rd19, %rd18, %rd17, %p2; - shr.s64 %rd20, %rd5, 63; - xor.b64 %rd21, %rd20, %rd5; - mul.hi.s64 %rd22, %rd21, 5270498306774157605; - shr.u64 %rd23, %rd22, 63; - shr.s64 %rd24, %rd22, 4; - add.s64 %rd25, %rd24, %rd23; - xor.b64 %rd26, %rd25, %rd20; - mul.hi.s64 %rd27, %rd26, 5270498306774157605; - shr.u64 %rd28, %rd27, 63; - shr.s64 %rd29, %rd27, 4; - add.s64 %rd30, %rd29, %rd28; - mul.lo.s64 %rd31, %rd30, 56; - sub.s64 %rd32, %rd26, %rd31; - setp.lt.s64 %p3, %rd32, 0; - add.s64 %rd33, %rd32, 56; - selp.b64 %rd34, %rd33, %rd32, %p3; - shr.s64 %rd35, %rd26, 63; - xor.b64 %rd36, %rd35, %rd26; - mul.hi.s64 %rd37, %rd36, 5270498306774157605; - shr.u64 %rd38, %rd37, 63; - shr.s64 %rd39, %rd37, 4; - add.s64 %rd40, %rd39, %rd38; - xor.b64 %rd41, %rd40, %rd35; - mul.lo.s64 %rd42, %rd41, 3136; - mul.lo.s64 %rd43, %rd34, 56; - add.s64 %rd44, %rd43, %rd19; - add.s64 %rd45, 
%rd44, %rd42; - shl.b64 %rd46, %rd45, 1; - add.s64 %rd47, %rd2, %rd46; - ld.global.b16 %h1, [%rd47]; - add.s64 %rd48, %rd4, %rd46; - ld.global.b16 %h2, [%rd48]; - add.s64 %rd49, %rd3, %rd46; - ld.global.b16 %h3, [%rd49]; - add.rn.f16 %h4, %h2, %h3; - mov.b16 %h5, 0x0000; - setp.gt.f16 %p4, %h1, %h5; - selp.b16 %h6, %h4, 0x0000, %p4; - add.s64 %rd50, %rd1, %rd46; - st.global.b16 [%rd50], %h6; -$L__BB26_2: + cvt.s64.s32 %rd15, %r3; + mul.wide.s32 %rd16, %r2, %r1; + add.s64 %rd21, %rd16, %rd15; + setp.gt.s64 %p1, %rd21, 200703; + @%p1 bra $L__BB14_3; + ld.param.u64 %rd12, [Unknown73_param_23]; + cvta.to.global.u64 %rd1, %rd12; + ld.param.u64 %rd13, [Unknown73_param_1]; + ld.param.u64 %rd14, [Unknown73_param_12]; + cvta.to.global.u64 %rd2, %rd14; + cvta.to.global.u64 %rd3, %rd13; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd5, %r2, %r4; + shl.b64 %rd20, %rd21, 1; + shl.b64 %rd7, %rd5, 1; +$L__BB14_2: + add.s64 %rd17, %rd3, %rd20; + ld.global.nc.u16 %rs1, [%rd17]; + add.s64 %rd18, %rd2, %rd20; + ld.global.nc.u16 %rs2, [%rd18]; + add.rn.f16 %rs3, %rs1, %rs2; + add.s64 %rd19, %rd1, %rd20; + st.global.b16 [%rd19], %rs3; + add.s64 %rd21, %rd21, %rd5; + add.s64 %rd20, %rd20, %rd7; + setp.lt.s64 %p2, %rd21, 200704; + @%p2 bra $L__BB14_2; +$L__BB14_3: ret; } @@ -2603,72 +1027,44 @@ $L__BB26_2: .param .u64 Unknown61_param_32 ) { - .reg .pred %p<5>; - .reg .b16 %h<5>; - .reg .b32 %r<4>; - .reg .b64 %rd<48>; + .reg .pred %p<4>; + .reg .b16 %rs<5>; + .reg .b32 %r<5>; + .reg .b64 %rd<22>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 200703; - @%p1 bra $L__BB27_2; - ld.param.u64 %rd5, [Unknown61_param_23]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown61_param_1]; - ld.param.u64 %rd7, [Unknown61_param_12]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - mul.hi.s64 %rd10, %rd4, 5270498306774157605; - shr.u64 %rd11, %rd10, 63; - shr.s64 %rd12, %rd10, 4; - add.s64 %rd13, %rd12, %rd11; - mul.lo.s64 %rd14, %rd13, 56; - sub.s64 %rd15, %rd4, %rd14; - setp.lt.s64 %p2, %rd15, 0; - add.s64 %rd16, %rd15, 56; - selp.b64 %rd17, %rd16, %rd15, %p2; - shr.s64 %rd18, %rd4, 63; - xor.b64 %rd19, %rd18, %rd4; - mul.hi.s64 %rd20, %rd19, 5270498306774157605; - shr.u64 %rd21, %rd20, 63; - shr.s64 %rd22, %rd20, 4; - add.s64 %rd23, %rd22, %rd21; - xor.b64 %rd24, %rd23, %rd18; - mul.hi.s64 %rd25, %rd24, 5270498306774157605; - shr.u64 %rd26, %rd25, 63; - shr.s64 %rd27, %rd25, 4; - add.s64 %rd28, %rd27, %rd26; - mul.lo.s64 %rd29, %rd28, 56; - sub.s64 %rd30, %rd24, %rd29; - setp.lt.s64 %p3, %rd30, 0; - add.s64 %rd31, %rd30, 56; - selp.b64 %rd32, %rd31, %rd30, %p3; - shr.s64 %rd33, %rd24, 63; - xor.b64 %rd34, %rd33, %rd24; - mul.hi.s64 %rd35, %rd34, 5270498306774157605; - shr.u64 %rd36, %rd35, 63; - shr.s64 %rd37, %rd35, 4; - add.s64 %rd38, %rd37, %rd36; - xor.b64 %rd39, %rd38, %rd33; - mul.lo.s64 %rd40, %rd39, 3136; - mul.lo.s64 %rd41, %rd32, 56; - add.s64 %rd42, %rd41, %rd17; - add.s64 %rd43, %rd42, %rd40; - shl.b64 %rd44, %rd43, 1; - add.s64 %rd45, %rd3, %rd44; - ld.global.b16 %h1, [%rd45]; - add.s64 %rd46, %rd2, %rd44; - ld.global.b16 %h2, [%rd46]; - mov.b16 %h3, 0x0000; - setp.gt.f16 %p4, %h1, %h3; - selp.b16 %h4, %h2, 0x0000, %p4; - add.s64 %rd47, %rd1, %rd44; - st.global.b16 [%rd47], %h4; -$L__BB27_2: + cvt.s64.s32 %rd15, %r3; + mul.wide.s32 %rd16, %r2, %r1; + add.s64 %rd21, %rd16, %rd15; + setp.gt.s64 %p1, %rd21, 200703; + @%p1 bra $L__BB15_3; + ld.param.u64 %rd12, 
[Unknown61_param_23]; + cvta.to.global.u64 %rd1, %rd12; + ld.param.u64 %rd13, [Unknown61_param_1]; + ld.param.u64 %rd14, [Unknown61_param_12]; + cvta.to.global.u64 %rd2, %rd14; + cvta.to.global.u64 %rd3, %rd13; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd5, %r2, %r4; + shl.b64 %rd20, %rd21, 1; + shl.b64 %rd7, %rd5, 1; +$L__BB15_2: + add.s64 %rd17, %rd3, %rd20; + ld.global.nc.u16 %rs1, [%rd17]; + add.s64 %rd18, %rd2, %rd20; + ld.global.nc.u16 %rs2, [%rd18]; + mov.b16 %rs3, 0x0000; + setp.gt.f16 %p2, %rs1, %rs3; + selp.b16 %rs4, %rs2, 0x0000, %p2; + add.s64 %rd19, %rd1, %rd20; + st.global.b16 [%rd19], %rs4; + add.s64 %rd21, %rd21, %rd5; + add.s64 %rd20, %rd20, %rd7; + setp.lt.s64 %p3, %rd21, 200704; + @%p3 bra $L__BB15_2; +$L__BB15_3: ret; } @@ -2720,305 +1116,49 @@ $L__BB27_2: .param .u64 Unknown57_param_43 ) { - .reg .pred %p<5>; - .reg .b16 %h<7>; - .reg .b32 %r<4>; - .reg .b64 %rd<51>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd10, %r3; - mul.wide.s32 %rd11, %r2, %r1; - add.s64 %rd5, %rd11, %rd10; - setp.gt.s64 %p1, %rd5, 200703; - @%p1 bra $L__BB28_2; - ld.param.u64 %rd6, [Unknown57_param_34]; - cvta.to.global.u64 %rd1, %rd6; - ld.param.u64 %rd7, [Unknown57_param_1]; - ld.param.u64 %rd8, [Unknown57_param_23]; - cvta.to.global.u64 %rd2, %rd8; - ld.param.u64 %rd9, [Unknown57_param_12]; - cvta.to.global.u64 %rd3, %rd9; - cvta.to.global.u64 %rd4, %rd7; - mul.hi.s64 %rd12, %rd5, 5270498306774157605; - shr.u64 %rd13, %rd12, 63; - shr.s64 %rd14, %rd12, 4; - add.s64 %rd15, %rd14, %rd13; - mul.lo.s64 %rd16, %rd15, 56; - sub.s64 %rd17, %rd5, %rd16; - setp.lt.s64 %p2, %rd17, 0; - add.s64 %rd18, %rd17, 56; - selp.b64 %rd19, %rd18, %rd17, %p2; - shr.s64 %rd20, %rd5, 63; - xor.b64 %rd21, %rd20, %rd5; - mul.hi.s64 %rd22, %rd21, 5270498306774157605; - shr.u64 %rd23, %rd22, 63; - shr.s64 %rd24, %rd22, 4; - add.s64 %rd25, %rd24, %rd23; - xor.b64 %rd26, %rd25, %rd20; - mul.hi.s64 %rd27, %rd26, 5270498306774157605; - shr.u64 %rd28, %rd27, 63; - shr.s64 %rd29, %rd27, 4; - add.s64 %rd30, %rd29, %rd28; - mul.lo.s64 %rd31, %rd30, 56; - sub.s64 %rd32, %rd26, %rd31; - setp.lt.s64 %p3, %rd32, 0; - add.s64 %rd33, %rd32, 56; - selp.b64 %rd34, %rd33, %rd32, %p3; - shr.s64 %rd35, %rd26, 63; - xor.b64 %rd36, %rd35, %rd26; - mul.hi.s64 %rd37, %rd36, 5270498306774157605; - shr.u64 %rd38, %rd37, 63; - shr.s64 %rd39, %rd37, 4; - add.s64 %rd40, %rd39, %rd38; - xor.b64 %rd41, %rd40, %rd35; - mul.lo.s64 %rd42, %rd41, 3136; - mul.lo.s64 %rd43, %rd34, 56; - add.s64 %rd44, %rd43, %rd19; - add.s64 %rd45, %rd44, %rd42; - shl.b64 %rd46, %rd45, 1; - add.s64 %rd47, %rd2, %rd46; - ld.global.b16 %h1, [%rd47]; - add.s64 %rd48, %rd4, %rd46; - ld.global.b16 %h2, [%rd48]; - add.s64 %rd49, %rd3, %rd46; - ld.global.b16 %h3, [%rd49]; - add.rn.f16 %h4, %h2, %h3; - mov.b16 %h5, 0x0000; - setp.gt.f16 %p4, %h1, %h5; - selp.b16 %h6, %h4, 0x0000, %p4; - add.s64 %rd50, %rd1, %rd46; - st.global.b16 [%rd50], %h6; -$L__BB28_2: - ret; - -} - // .globl Unknown50 -.visible .entry Unknown50( - .param .u64 Unknown50_param_0, - .param .u64 Unknown50_param_1, - .param .u64 Unknown50_param_2, - .param .u64 Unknown50_param_3, - .param .u64 Unknown50_param_4, - .param .u64 Unknown50_param_5, - .param .u64 Unknown50_param_6, - .param .u64 Unknown50_param_7, - .param .u64 Unknown50_param_8, - .param .u64 Unknown50_param_9, - .param .u64 Unknown50_param_10, - .param .u64 Unknown50_param_11, - .param .u64 Unknown50_param_12, - .param .u64 Unknown50_param_13, - .param .u64 Unknown50_param_14, - .param .u64 
Unknown50_param_15, - .param .u64 Unknown50_param_16, - .param .u64 Unknown50_param_17, - .param .u64 Unknown50_param_18, - .param .u64 Unknown50_param_19, - .param .u64 Unknown50_param_20, - .param .u64 Unknown50_param_21, - .param .u64 Unknown50_param_22, - .param .u64 Unknown50_param_23, - .param .u64 Unknown50_param_24, - .param .u64 Unknown50_param_25, - .param .u64 Unknown50_param_26, - .param .u64 Unknown50_param_27, - .param .u64 Unknown50_param_28, - .param .u64 Unknown50_param_29, - .param .u64 Unknown50_param_30, - .param .u64 Unknown50_param_31, - .param .u64 Unknown50_param_32 -) -{ - .reg .pred %p<5>; - .reg .b16 %h<5>; - .reg .b32 %r<4>; - .reg .b64 %rd<48>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 100351; - @%p1 bra $L__BB29_2; - ld.param.u64 %rd5, [Unknown50_param_23]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown50_param_1]; - ld.param.u64 %rd7, [Unknown50_param_12]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - mul.hi.s64 %rd10, %rd4, 5270498306774157605; - shr.u64 %rd11, %rd10, 63; - shr.s64 %rd12, %rd10, 3; - add.s64 %rd13, %rd12, %rd11; - mul.lo.s64 %rd14, %rd13, 28; - sub.s64 %rd15, %rd4, %rd14; - setp.lt.s64 %p2, %rd15, 0; - add.s64 %rd16, %rd15, 28; - selp.b64 %rd17, %rd16, %rd15, %p2; - shr.s64 %rd18, %rd4, 63; - xor.b64 %rd19, %rd18, %rd4; - mul.hi.s64 %rd20, %rd19, 5270498306774157605; - shr.u64 %rd21, %rd20, 63; - shr.s64 %rd22, %rd20, 3; - add.s64 %rd23, %rd22, %rd21; - xor.b64 %rd24, %rd23, %rd18; - mul.hi.s64 %rd25, %rd24, 5270498306774157605; - shr.u64 %rd26, %rd25, 63; - shr.s64 %rd27, %rd25, 3; - add.s64 %rd28, %rd27, %rd26; - mul.lo.s64 %rd29, %rd28, 28; - sub.s64 %rd30, %rd24, %rd29; - setp.lt.s64 %p3, %rd30, 0; - add.s64 %rd31, %rd30, 28; - selp.b64 %rd32, %rd31, %rd30, %p3; - shr.s64 %rd33, %rd24, 63; - xor.b64 %rd34, %rd33, %rd24; - mul.hi.s64 %rd35, %rd34, 5270498306774157605; - shr.u64 %rd36, %rd35, 63; - shr.s64 %rd37, %rd35, 3; - add.s64 %rd38, %rd37, %rd36; - xor.b64 %rd39, %rd38, %rd33; - mul.lo.s64 %rd40, %rd39, 784; - mul.lo.s64 %rd41, %rd32, 28; - add.s64 %rd42, %rd41, %rd17; - add.s64 %rd43, %rd42, %rd40; - shl.b64 %rd44, %rd43, 1; - add.s64 %rd45, %rd3, %rd44; - ld.global.b16 %h1, [%rd45]; - add.s64 %rd46, %rd2, %rd44; - ld.global.b16 %h2, [%rd46]; - mov.b16 %h3, 0x0000; - setp.gt.f16 %p4, %h1, %h3; - selp.b16 %h4, %h2, 0x0000, %p4; - add.s64 %rd47, %rd1, %rd44; - st.global.b16 [%rd47], %h4; -$L__BB29_2: - ret; - -} - // .globl Unknown46 -.visible .entry Unknown46( - .param .u64 Unknown46_param_0, - .param .u64 Unknown46_param_1, - .param .u64 Unknown46_param_2, - .param .u64 Unknown46_param_3, - .param .u64 Unknown46_param_4, - .param .u64 Unknown46_param_5, - .param .u64 Unknown46_param_6, - .param .u64 Unknown46_param_7, - .param .u64 Unknown46_param_8, - .param .u64 Unknown46_param_9, - .param .u64 Unknown46_param_10, - .param .u64 Unknown46_param_11, - .param .u64 Unknown46_param_12, - .param .u64 Unknown46_param_13, - .param .u64 Unknown46_param_14, - .param .u64 Unknown46_param_15, - .param .u64 Unknown46_param_16, - .param .u64 Unknown46_param_17, - .param .u64 Unknown46_param_18, - .param .u64 Unknown46_param_19, - .param .u64 Unknown46_param_20, - .param .u64 Unknown46_param_21, - .param .u64 Unknown46_param_22, - .param .u64 Unknown46_param_23, - .param .u64 Unknown46_param_24, - .param .u64 Unknown46_param_25, - .param .u64 Unknown46_param_26, - 
.param .u64 Unknown46_param_27, - .param .u64 Unknown46_param_28, - .param .u64 Unknown46_param_29, - .param .u64 Unknown46_param_30, - .param .u64 Unknown46_param_31, - .param .u64 Unknown46_param_32, - .param .u64 Unknown46_param_33, - .param .u64 Unknown46_param_34, - .param .u64 Unknown46_param_35, - .param .u64 Unknown46_param_36, - .param .u64 Unknown46_param_37, - .param .u64 Unknown46_param_38, - .param .u64 Unknown46_param_39, - .param .u64 Unknown46_param_40, - .param .u64 Unknown46_param_41, - .param .u64 Unknown46_param_42, - .param .u64 Unknown46_param_43 -) -{ - .reg .pred %p<5>; - .reg .b16 %h<7>; - .reg .b32 %r<4>; - .reg .b64 %rd<51>; + .reg .pred %p<4>; + .reg .b16 %rs<7>; + .reg .b32 %r<5>; + .reg .b64 %rd<25>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd10, %r3; - mul.wide.s32 %rd11, %r2, %r1; - add.s64 %rd5, %rd11, %rd10; - setp.gt.s64 %p1, %rd5, 100351; - @%p1 bra $L__BB30_2; - ld.param.u64 %rd6, [Unknown46_param_34]; - cvta.to.global.u64 %rd1, %rd6; - ld.param.u64 %rd7, [Unknown46_param_1]; - ld.param.u64 %rd8, [Unknown46_param_23]; - cvta.to.global.u64 %rd2, %rd8; - ld.param.u64 %rd9, [Unknown46_param_12]; - cvta.to.global.u64 %rd3, %rd9; - cvta.to.global.u64 %rd4, %rd7; - mul.hi.s64 %rd12, %rd5, 5270498306774157605; - shr.u64 %rd13, %rd12, 63; - shr.s64 %rd14, %rd12, 3; - add.s64 %rd15, %rd14, %rd13; - mul.lo.s64 %rd16, %rd15, 28; - sub.s64 %rd17, %rd5, %rd16; - setp.lt.s64 %p2, %rd17, 0; - add.s64 %rd18, %rd17, 28; - selp.b64 %rd19, %rd18, %rd17, %p2; - shr.s64 %rd20, %rd5, 63; - xor.b64 %rd21, %rd20, %rd5; - mul.hi.s64 %rd22, %rd21, 5270498306774157605; - shr.u64 %rd23, %rd22, 63; - shr.s64 %rd24, %rd22, 3; - add.s64 %rd25, %rd24, %rd23; - xor.b64 %rd26, %rd25, %rd20; - mul.hi.s64 %rd27, %rd26, 5270498306774157605; - shr.u64 %rd28, %rd27, 63; - shr.s64 %rd29, %rd27, 3; - add.s64 %rd30, %rd29, %rd28; - mul.lo.s64 %rd31, %rd30, 28; - sub.s64 %rd32, %rd26, %rd31; - setp.lt.s64 %p3, %rd32, 0; - add.s64 %rd33, %rd32, 28; - selp.b64 %rd34, %rd33, %rd32, %p3; - shr.s64 %rd35, %rd26, 63; - xor.b64 %rd36, %rd35, %rd26; - mul.hi.s64 %rd37, %rd36, 5270498306774157605; - shr.u64 %rd38, %rd37, 63; - shr.s64 %rd39, %rd37, 3; - add.s64 %rd40, %rd39, %rd38; - xor.b64 %rd41, %rd40, %rd35; - mul.lo.s64 %rd42, %rd41, 784; - mul.lo.s64 %rd43, %rd34, 28; - add.s64 %rd44, %rd43, %rd19; - add.s64 %rd45, %rd44, %rd42; - shl.b64 %rd46, %rd45, 1; - add.s64 %rd47, %rd2, %rd46; - ld.global.b16 %h1, [%rd47]; - add.s64 %rd48, %rd4, %rd46; - ld.global.b16 %h2, [%rd48]; - add.s64 %rd49, %rd3, %rd46; - ld.global.b16 %h3, [%rd49]; - add.rn.f16 %h4, %h2, %h3; - mov.b16 %h5, 0x0000; - setp.gt.f16 %p4, %h1, %h5; - selp.b16 %h6, %h4, 0x0000, %p4; - add.s64 %rd50, %rd1, %rd46; - st.global.b16 [%rd50], %h6; -$L__BB30_2: + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd24, %rd18, %rd17; + setp.gt.s64 %p1, %rd24, 200703; + @%p1 bra $L__BB16_3; + ld.param.u64 %rd13, [Unknown57_param_34]; + cvta.to.global.u64 %rd1, %rd13; + ld.param.u64 %rd14, [Unknown57_param_1]; + ld.param.u64 %rd15, [Unknown57_param_23]; + cvta.to.global.u64 %rd2, %rd15; + ld.param.u64 %rd16, [Unknown57_param_12]; + cvta.to.global.u64 %rd3, %rd16; + cvta.to.global.u64 %rd4, %rd14; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd6, %r2, %r4; + shl.b64 %rd23, %rd24, 1; + shl.b64 %rd8, %rd6, 1; +$L__BB16_2: + add.s64 %rd19, %rd4, %rd23; + ld.global.nc.u16 %rs1, [%rd19]; + add.s64 %rd20, %rd3, %rd23; + ld.global.nc.u16 %rs2, [%rd20]; + add.s64 %rd21, %rd2, %rd23; + ld.global.nc.u16 
%rs3, [%rd21]; + add.rn.f16 %rs4, %rs1, %rs2; + mov.b16 %rs5, 0x0000; + setp.gt.f16 %p2, %rs3, %rs5; + selp.b16 %rs6, %rs4, 0x0000, %p2; + add.s64 %rd22, %rd1, %rd23; + st.global.b16 [%rd22], %rs6; + add.s64 %rd24, %rd24, %rd6; + add.s64 %rd23, %rd23, %rd8; + setp.lt.s64 %p3, %rd24, 200704; + @%p3 bra $L__BB16_2; +$L__BB16_3: ret; } @@ -3059,72 +1199,44 @@ $L__BB30_2: .param .u64 Unknown42_param_32 ) { - .reg .pred %p<5>; - .reg .b16 %h<5>; - .reg .b32 %r<4>; - .reg .b64 %rd<48>; + .reg .pred %p<4>; + .reg .b16 %rs<5>; + .reg .b32 %r<5>; + .reg .b64 %rd<22>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 100351; - @%p1 bra $L__BB31_2; - ld.param.u64 %rd5, [Unknown42_param_23]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown42_param_1]; - ld.param.u64 %rd7, [Unknown42_param_12]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - mul.hi.s64 %rd10, %rd4, 5270498306774157605; - shr.u64 %rd11, %rd10, 63; - shr.s64 %rd12, %rd10, 3; - add.s64 %rd13, %rd12, %rd11; - mul.lo.s64 %rd14, %rd13, 28; - sub.s64 %rd15, %rd4, %rd14; - setp.lt.s64 %p2, %rd15, 0; - add.s64 %rd16, %rd15, 28; - selp.b64 %rd17, %rd16, %rd15, %p2; - shr.s64 %rd18, %rd4, 63; - xor.b64 %rd19, %rd18, %rd4; - mul.hi.s64 %rd20, %rd19, 5270498306774157605; - shr.u64 %rd21, %rd20, 63; - shr.s64 %rd22, %rd20, 3; - add.s64 %rd23, %rd22, %rd21; - xor.b64 %rd24, %rd23, %rd18; - mul.hi.s64 %rd25, %rd24, 5270498306774157605; - shr.u64 %rd26, %rd25, 63; - shr.s64 %rd27, %rd25, 3; - add.s64 %rd28, %rd27, %rd26; - mul.lo.s64 %rd29, %rd28, 28; - sub.s64 %rd30, %rd24, %rd29; - setp.lt.s64 %p3, %rd30, 0; - add.s64 %rd31, %rd30, 28; - selp.b64 %rd32, %rd31, %rd30, %p3; - shr.s64 %rd33, %rd24, 63; - xor.b64 %rd34, %rd33, %rd24; - mul.hi.s64 %rd35, %rd34, 5270498306774157605; - shr.u64 %rd36, %rd35, 63; - shr.s64 %rd37, %rd35, 3; - add.s64 %rd38, %rd37, %rd36; - xor.b64 %rd39, %rd38, %rd33; - mul.lo.s64 %rd40, %rd39, 784; - mul.lo.s64 %rd41, %rd32, 28; - add.s64 %rd42, %rd41, %rd17; - add.s64 %rd43, %rd42, %rd40; - shl.b64 %rd44, %rd43, 1; - add.s64 %rd45, %rd3, %rd44; - ld.global.b16 %h1, [%rd45]; - add.s64 %rd46, %rd2, %rd44; - ld.global.b16 %h2, [%rd46]; - mov.b16 %h3, 0x0000; - setp.gt.f16 %p4, %h1, %h3; - selp.b16 %h4, %h2, 0x0000, %p4; - add.s64 %rd47, %rd1, %rd44; - st.global.b16 [%rd47], %h4; -$L__BB31_2: + cvt.s64.s32 %rd15, %r3; + mul.wide.s32 %rd16, %r2, %r1; + add.s64 %rd21, %rd16, %rd15; + setp.gt.s64 %p1, %rd21, 100351; + @%p1 bra $L__BB17_3; + ld.param.u64 %rd12, [Unknown42_param_23]; + cvta.to.global.u64 %rd1, %rd12; + ld.param.u64 %rd13, [Unknown42_param_1]; + ld.param.u64 %rd14, [Unknown42_param_12]; + cvta.to.global.u64 %rd2, %rd14; + cvta.to.global.u64 %rd3, %rd13; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd5, %r2, %r4; + shl.b64 %rd20, %rd21, 1; + shl.b64 %rd7, %rd5, 1; +$L__BB17_2: + add.s64 %rd17, %rd3, %rd20; + ld.global.nc.u16 %rs1, [%rd17]; + add.s64 %rd18, %rd2, %rd20; + ld.global.nc.u16 %rs2, [%rd18]; + mov.b16 %rs3, 0x0000; + setp.gt.f16 %p2, %rs1, %rs3; + selp.b16 %rs4, %rs2, 0x0000, %p2; + add.s64 %rd19, %rd1, %rd20; + st.global.b16 [%rd19], %rs4; + add.s64 %rd21, %rd21, %rd5; + add.s64 %rd20, %rd20, %rd7; + setp.lt.s64 %p3, %rd21, 100352; + @%p3 bra $L__BB17_2; +$L__BB17_3: ret; } @@ -3176,305 +1288,49 @@ $L__BB31_2: .param .u64 Unknown38_param_43 ) { - .reg .pred %p<5>; - .reg .b16 %h<7>; - .reg .b32 %r<4>; - .reg .b64 %rd<51>; - - mov.u32 %r1, %ctaid.x; - 
mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd10, %r3; - mul.wide.s32 %rd11, %r2, %r1; - add.s64 %rd5, %rd11, %rd10; - setp.gt.s64 %p1, %rd5, 100351; - @%p1 bra $L__BB32_2; - ld.param.u64 %rd6, [Unknown38_param_34]; - cvta.to.global.u64 %rd1, %rd6; - ld.param.u64 %rd7, [Unknown38_param_1]; - ld.param.u64 %rd8, [Unknown38_param_23]; - cvta.to.global.u64 %rd2, %rd8; - ld.param.u64 %rd9, [Unknown38_param_12]; - cvta.to.global.u64 %rd3, %rd9; - cvta.to.global.u64 %rd4, %rd7; - mul.hi.s64 %rd12, %rd5, 5270498306774157605; - shr.u64 %rd13, %rd12, 63; - shr.s64 %rd14, %rd12, 3; - add.s64 %rd15, %rd14, %rd13; - mul.lo.s64 %rd16, %rd15, 28; - sub.s64 %rd17, %rd5, %rd16; - setp.lt.s64 %p2, %rd17, 0; - add.s64 %rd18, %rd17, 28; - selp.b64 %rd19, %rd18, %rd17, %p2; - shr.s64 %rd20, %rd5, 63; - xor.b64 %rd21, %rd20, %rd5; - mul.hi.s64 %rd22, %rd21, 5270498306774157605; - shr.u64 %rd23, %rd22, 63; - shr.s64 %rd24, %rd22, 3; - add.s64 %rd25, %rd24, %rd23; - xor.b64 %rd26, %rd25, %rd20; - mul.hi.s64 %rd27, %rd26, 5270498306774157605; - shr.u64 %rd28, %rd27, 63; - shr.s64 %rd29, %rd27, 3; - add.s64 %rd30, %rd29, %rd28; - mul.lo.s64 %rd31, %rd30, 28; - sub.s64 %rd32, %rd26, %rd31; - setp.lt.s64 %p3, %rd32, 0; - add.s64 %rd33, %rd32, 28; - selp.b64 %rd34, %rd33, %rd32, %p3; - shr.s64 %rd35, %rd26, 63; - xor.b64 %rd36, %rd35, %rd26; - mul.hi.s64 %rd37, %rd36, 5270498306774157605; - shr.u64 %rd38, %rd37, 63; - shr.s64 %rd39, %rd37, 3; - add.s64 %rd40, %rd39, %rd38; - xor.b64 %rd41, %rd40, %rd35; - mul.lo.s64 %rd42, %rd41, 784; - mul.lo.s64 %rd43, %rd34, 28; - add.s64 %rd44, %rd43, %rd19; - add.s64 %rd45, %rd44, %rd42; - shl.b64 %rd46, %rd45, 1; - add.s64 %rd47, %rd2, %rd46; - ld.global.b16 %h1, [%rd47]; - add.s64 %rd48, %rd4, %rd46; - ld.global.b16 %h2, [%rd48]; - add.s64 %rd49, %rd3, %rd46; - ld.global.b16 %h3, [%rd49]; - add.rn.f16 %h4, %h2, %h3; - mov.b16 %h5, 0x0000; - setp.gt.f16 %p4, %h1, %h5; - selp.b16 %h6, %h4, 0x0000, %p4; - add.s64 %rd50, %rd1, %rd46; - st.global.b16 [%rd50], %h6; -$L__BB32_2: - ret; - -} - // .globl Unknown31 -.visible .entry Unknown31( - .param .u64 Unknown31_param_0, - .param .u64 Unknown31_param_1, - .param .u64 Unknown31_param_2, - .param .u64 Unknown31_param_3, - .param .u64 Unknown31_param_4, - .param .u64 Unknown31_param_5, - .param .u64 Unknown31_param_6, - .param .u64 Unknown31_param_7, - .param .u64 Unknown31_param_8, - .param .u64 Unknown31_param_9, - .param .u64 Unknown31_param_10, - .param .u64 Unknown31_param_11, - .param .u64 Unknown31_param_12, - .param .u64 Unknown31_param_13, - .param .u64 Unknown31_param_14, - .param .u64 Unknown31_param_15, - .param .u64 Unknown31_param_16, - .param .u64 Unknown31_param_17, - .param .u64 Unknown31_param_18, - .param .u64 Unknown31_param_19, - .param .u64 Unknown31_param_20, - .param .u64 Unknown31_param_21, - .param .u64 Unknown31_param_22, - .param .u64 Unknown31_param_23, - .param .u64 Unknown31_param_24, - .param .u64 Unknown31_param_25, - .param .u64 Unknown31_param_26, - .param .u64 Unknown31_param_27, - .param .u64 Unknown31_param_28, - .param .u64 Unknown31_param_29, - .param .u64 Unknown31_param_30, - .param .u64 Unknown31_param_31, - .param .u64 Unknown31_param_32 -) -{ - .reg .pred %p<5>; - .reg .b16 %h<5>; - .reg .b32 %r<4>; - .reg .b64 %rd<48>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 50175; - @%p1 bra $L__BB33_2; - ld.param.u64 %rd5, [Unknown31_param_23]; - 
cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown31_param_1]; - ld.param.u64 %rd7, [Unknown31_param_12]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - mul.hi.s64 %rd10, %rd4, 5270498306774157605; - shr.u64 %rd11, %rd10, 63; - shr.s64 %rd12, %rd10, 2; - add.s64 %rd13, %rd12, %rd11; - mul.lo.s64 %rd14, %rd13, 14; - sub.s64 %rd15, %rd4, %rd14; - setp.lt.s64 %p2, %rd15, 0; - add.s64 %rd16, %rd15, 14; - selp.b64 %rd17, %rd16, %rd15, %p2; - shr.s64 %rd18, %rd4, 63; - xor.b64 %rd19, %rd18, %rd4; - mul.hi.s64 %rd20, %rd19, 5270498306774157605; - shr.u64 %rd21, %rd20, 63; - shr.s64 %rd22, %rd20, 2; - add.s64 %rd23, %rd22, %rd21; - xor.b64 %rd24, %rd23, %rd18; - mul.hi.s64 %rd25, %rd24, 5270498306774157605; - shr.u64 %rd26, %rd25, 63; - shr.s64 %rd27, %rd25, 2; - add.s64 %rd28, %rd27, %rd26; - mul.lo.s64 %rd29, %rd28, 14; - sub.s64 %rd30, %rd24, %rd29; - setp.lt.s64 %p3, %rd30, 0; - add.s64 %rd31, %rd30, 14; - selp.b64 %rd32, %rd31, %rd30, %p3; - shr.s64 %rd33, %rd24, 63; - xor.b64 %rd34, %rd33, %rd24; - mul.hi.s64 %rd35, %rd34, 5270498306774157605; - shr.u64 %rd36, %rd35, 63; - shr.s64 %rd37, %rd35, 2; - add.s64 %rd38, %rd37, %rd36; - xor.b64 %rd39, %rd38, %rd33; - mul.lo.s64 %rd40, %rd39, 196; - mul.lo.s64 %rd41, %rd32, 14; - add.s64 %rd42, %rd41, %rd17; - add.s64 %rd43, %rd42, %rd40; - shl.b64 %rd44, %rd43, 1; - add.s64 %rd45, %rd3, %rd44; - ld.global.b16 %h1, [%rd45]; - add.s64 %rd46, %rd2, %rd44; - ld.global.b16 %h2, [%rd46]; - mov.b16 %h3, 0x0000; - setp.gt.f16 %p4, %h1, %h3; - selp.b16 %h4, %h2, 0x0000, %p4; - add.s64 %rd47, %rd1, %rd44; - st.global.b16 [%rd47], %h4; -$L__BB33_2: - ret; - -} - // .globl Unknown27 -.visible .entry Unknown27( - .param .u64 Unknown27_param_0, - .param .u64 Unknown27_param_1, - .param .u64 Unknown27_param_2, - .param .u64 Unknown27_param_3, - .param .u64 Unknown27_param_4, - .param .u64 Unknown27_param_5, - .param .u64 Unknown27_param_6, - .param .u64 Unknown27_param_7, - .param .u64 Unknown27_param_8, - .param .u64 Unknown27_param_9, - .param .u64 Unknown27_param_10, - .param .u64 Unknown27_param_11, - .param .u64 Unknown27_param_12, - .param .u64 Unknown27_param_13, - .param .u64 Unknown27_param_14, - .param .u64 Unknown27_param_15, - .param .u64 Unknown27_param_16, - .param .u64 Unknown27_param_17, - .param .u64 Unknown27_param_18, - .param .u64 Unknown27_param_19, - .param .u64 Unknown27_param_20, - .param .u64 Unknown27_param_21, - .param .u64 Unknown27_param_22, - .param .u64 Unknown27_param_23, - .param .u64 Unknown27_param_24, - .param .u64 Unknown27_param_25, - .param .u64 Unknown27_param_26, - .param .u64 Unknown27_param_27, - .param .u64 Unknown27_param_28, - .param .u64 Unknown27_param_29, - .param .u64 Unknown27_param_30, - .param .u64 Unknown27_param_31, - .param .u64 Unknown27_param_32, - .param .u64 Unknown27_param_33, - .param .u64 Unknown27_param_34, - .param .u64 Unknown27_param_35, - .param .u64 Unknown27_param_36, - .param .u64 Unknown27_param_37, - .param .u64 Unknown27_param_38, - .param .u64 Unknown27_param_39, - .param .u64 Unknown27_param_40, - .param .u64 Unknown27_param_41, - .param .u64 Unknown27_param_42, - .param .u64 Unknown27_param_43 -) -{ - .reg .pred %p<5>; - .reg .b16 %h<7>; - .reg .b32 %r<4>; - .reg .b64 %rd<51>; + .reg .pred %p<4>; + .reg .b16 %rs<7>; + .reg .b32 %r<5>; + .reg .b64 %rd<25>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd10, %r3; - mul.wide.s32 %rd11, %r2, %r1; - add.s64 %rd5, %rd11, %rd10; - setp.gt.s64 %p1, %rd5, 50175; - @%p1 bra 
$L__BB34_2; - ld.param.u64 %rd6, [Unknown27_param_34]; - cvta.to.global.u64 %rd1, %rd6; - ld.param.u64 %rd7, [Unknown27_param_1]; - ld.param.u64 %rd8, [Unknown27_param_23]; - cvta.to.global.u64 %rd2, %rd8; - ld.param.u64 %rd9, [Unknown27_param_12]; - cvta.to.global.u64 %rd3, %rd9; - cvta.to.global.u64 %rd4, %rd7; - mul.hi.s64 %rd12, %rd5, 5270498306774157605; - shr.u64 %rd13, %rd12, 63; - shr.s64 %rd14, %rd12, 2; - add.s64 %rd15, %rd14, %rd13; - mul.lo.s64 %rd16, %rd15, 14; - sub.s64 %rd17, %rd5, %rd16; - setp.lt.s64 %p2, %rd17, 0; - add.s64 %rd18, %rd17, 14; - selp.b64 %rd19, %rd18, %rd17, %p2; - shr.s64 %rd20, %rd5, 63; - xor.b64 %rd21, %rd20, %rd5; - mul.hi.s64 %rd22, %rd21, 5270498306774157605; - shr.u64 %rd23, %rd22, 63; - shr.s64 %rd24, %rd22, 2; - add.s64 %rd25, %rd24, %rd23; - xor.b64 %rd26, %rd25, %rd20; - mul.hi.s64 %rd27, %rd26, 5270498306774157605; - shr.u64 %rd28, %rd27, 63; - shr.s64 %rd29, %rd27, 2; - add.s64 %rd30, %rd29, %rd28; - mul.lo.s64 %rd31, %rd30, 14; - sub.s64 %rd32, %rd26, %rd31; - setp.lt.s64 %p3, %rd32, 0; - add.s64 %rd33, %rd32, 14; - selp.b64 %rd34, %rd33, %rd32, %p3; - shr.s64 %rd35, %rd26, 63; - xor.b64 %rd36, %rd35, %rd26; - mul.hi.s64 %rd37, %rd36, 5270498306774157605; - shr.u64 %rd38, %rd37, 63; - shr.s64 %rd39, %rd37, 2; - add.s64 %rd40, %rd39, %rd38; - xor.b64 %rd41, %rd40, %rd35; - mul.lo.s64 %rd42, %rd41, 196; - mul.lo.s64 %rd43, %rd34, 14; - add.s64 %rd44, %rd43, %rd19; - add.s64 %rd45, %rd44, %rd42; - shl.b64 %rd46, %rd45, 1; - add.s64 %rd47, %rd2, %rd46; - ld.global.b16 %h1, [%rd47]; - add.s64 %rd48, %rd4, %rd46; - ld.global.b16 %h2, [%rd48]; - add.s64 %rd49, %rd3, %rd46; - ld.global.b16 %h3, [%rd49]; - add.rn.f16 %h4, %h2, %h3; - mov.b16 %h5, 0x0000; - setp.gt.f16 %p4, %h1, %h5; - selp.b16 %h6, %h4, 0x0000, %p4; - add.s64 %rd50, %rd1, %rd46; - st.global.b16 [%rd50], %h6; -$L__BB34_2: + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd24, %rd18, %rd17; + setp.gt.s64 %p1, %rd24, 100351; + @%p1 bra $L__BB18_3; + ld.param.u64 %rd13, [Unknown38_param_34]; + cvta.to.global.u64 %rd1, %rd13; + ld.param.u64 %rd14, [Unknown38_param_1]; + ld.param.u64 %rd15, [Unknown38_param_23]; + cvta.to.global.u64 %rd2, %rd15; + ld.param.u64 %rd16, [Unknown38_param_12]; + cvta.to.global.u64 %rd3, %rd16; + cvta.to.global.u64 %rd4, %rd14; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd6, %r2, %r4; + shl.b64 %rd23, %rd24, 1; + shl.b64 %rd8, %rd6, 1; +$L__BB18_2: + add.s64 %rd19, %rd4, %rd23; + ld.global.nc.u16 %rs1, [%rd19]; + add.s64 %rd20, %rd3, %rd23; + ld.global.nc.u16 %rs2, [%rd20]; + add.s64 %rd21, %rd2, %rd23; + ld.global.nc.u16 %rs3, [%rd21]; + add.rn.f16 %rs4, %rs1, %rs2; + mov.b16 %rs5, 0x0000; + setp.gt.f16 %p2, %rs3, %rs5; + selp.b16 %rs6, %rs4, 0x0000, %p2; + add.s64 %rd22, %rd1, %rd23; + st.global.b16 [%rd22], %rs6; + add.s64 %rd24, %rd24, %rd6; + add.s64 %rd23, %rd23, %rd8; + setp.lt.s64 %p3, %rd24, 100352; + @%p3 bra $L__BB18_2; +$L__BB18_3: ret; } @@ -3515,72 +1371,44 @@ $L__BB34_2: .param .u64 Unknown23_param_32 ) { - .reg .pred %p<5>; - .reg .b16 %h<5>; - .reg .b32 %r<4>; - .reg .b64 %rd<48>; + .reg .pred %p<4>; + .reg .b16 %rs<5>; + .reg .b32 %r<5>; + .reg .b64 %rd<22>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 50175; - @%p1 bra $L__BB35_2; - ld.param.u64 %rd5, [Unknown23_param_23]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown23_param_1]; - ld.param.u64 %rd7, [Unknown23_param_12]; - 
cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - mul.hi.s64 %rd10, %rd4, 5270498306774157605; - shr.u64 %rd11, %rd10, 63; - shr.s64 %rd12, %rd10, 2; - add.s64 %rd13, %rd12, %rd11; - mul.lo.s64 %rd14, %rd13, 14; - sub.s64 %rd15, %rd4, %rd14; - setp.lt.s64 %p2, %rd15, 0; - add.s64 %rd16, %rd15, 14; - selp.b64 %rd17, %rd16, %rd15, %p2; - shr.s64 %rd18, %rd4, 63; - xor.b64 %rd19, %rd18, %rd4; - mul.hi.s64 %rd20, %rd19, 5270498306774157605; - shr.u64 %rd21, %rd20, 63; - shr.s64 %rd22, %rd20, 2; - add.s64 %rd23, %rd22, %rd21; - xor.b64 %rd24, %rd23, %rd18; - mul.hi.s64 %rd25, %rd24, 5270498306774157605; - shr.u64 %rd26, %rd25, 63; - shr.s64 %rd27, %rd25, 2; - add.s64 %rd28, %rd27, %rd26; - mul.lo.s64 %rd29, %rd28, 14; - sub.s64 %rd30, %rd24, %rd29; - setp.lt.s64 %p3, %rd30, 0; - add.s64 %rd31, %rd30, 14; - selp.b64 %rd32, %rd31, %rd30, %p3; - shr.s64 %rd33, %rd24, 63; - xor.b64 %rd34, %rd33, %rd24; - mul.hi.s64 %rd35, %rd34, 5270498306774157605; - shr.u64 %rd36, %rd35, 63; - shr.s64 %rd37, %rd35, 2; - add.s64 %rd38, %rd37, %rd36; - xor.b64 %rd39, %rd38, %rd33; - mul.lo.s64 %rd40, %rd39, 196; - mul.lo.s64 %rd41, %rd32, 14; - add.s64 %rd42, %rd41, %rd17; - add.s64 %rd43, %rd42, %rd40; - shl.b64 %rd44, %rd43, 1; - add.s64 %rd45, %rd3, %rd44; - ld.global.b16 %h1, [%rd45]; - add.s64 %rd46, %rd2, %rd44; - ld.global.b16 %h2, [%rd46]; - mov.b16 %h3, 0x0000; - setp.gt.f16 %p4, %h1, %h3; - selp.b16 %h4, %h2, 0x0000, %p4; - add.s64 %rd47, %rd1, %rd44; - st.global.b16 [%rd47], %h4; -$L__BB35_2: + cvt.s64.s32 %rd15, %r3; + mul.wide.s32 %rd16, %r2, %r1; + add.s64 %rd21, %rd16, %rd15; + setp.gt.s64 %p1, %rd21, 50175; + @%p1 bra $L__BB19_3; + ld.param.u64 %rd12, [Unknown23_param_23]; + cvta.to.global.u64 %rd1, %rd12; + ld.param.u64 %rd13, [Unknown23_param_1]; + ld.param.u64 %rd14, [Unknown23_param_12]; + cvta.to.global.u64 %rd2, %rd14; + cvta.to.global.u64 %rd3, %rd13; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd5, %r2, %r4; + shl.b64 %rd20, %rd21, 1; + shl.b64 %rd7, %rd5, 1; +$L__BB19_2: + add.s64 %rd17, %rd3, %rd20; + ld.global.nc.u16 %rs1, [%rd17]; + add.s64 %rd18, %rd2, %rd20; + ld.global.nc.u16 %rs2, [%rd18]; + mov.b16 %rs3, 0x0000; + setp.gt.f16 %p2, %rs1, %rs3; + selp.b16 %rs4, %rs2, 0x0000, %p2; + add.s64 %rd19, %rd1, %rd20; + st.global.b16 [%rd19], %rs4; + add.s64 %rd21, %rd21, %rd5; + add.s64 %rd20, %rd20, %rd7; + setp.lt.s64 %p3, %rd21, 50176; + @%p3 bra $L__BB19_2; +$L__BB19_3: ret; } @@ -3632,183 +1460,49 @@ $L__BB35_2: .param .u64 Unknown19_param_43 ) { - .reg .pred %p<5>; - .reg .b16 %h<7>; - .reg .b32 %r<4>; - .reg .b64 %rd<51>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd10, %r3; - mul.wide.s32 %rd11, %r2, %r1; - add.s64 %rd5, %rd11, %rd10; - setp.gt.s64 %p1, %rd5, 50175; - @%p1 bra $L__BB36_2; - ld.param.u64 %rd6, [Unknown19_param_34]; - cvta.to.global.u64 %rd1, %rd6; - ld.param.u64 %rd7, [Unknown19_param_1]; - ld.param.u64 %rd8, [Unknown19_param_23]; - cvta.to.global.u64 %rd2, %rd8; - ld.param.u64 %rd9, [Unknown19_param_12]; - cvta.to.global.u64 %rd3, %rd9; - cvta.to.global.u64 %rd4, %rd7; - mul.hi.s64 %rd12, %rd5, 5270498306774157605; - shr.u64 %rd13, %rd12, 63; - shr.s64 %rd14, %rd12, 2; - add.s64 %rd15, %rd14, %rd13; - mul.lo.s64 %rd16, %rd15, 14; - sub.s64 %rd17, %rd5, %rd16; - setp.lt.s64 %p2, %rd17, 0; - add.s64 %rd18, %rd17, 14; - selp.b64 %rd19, %rd18, %rd17, %p2; - shr.s64 %rd20, %rd5, 63; - xor.b64 %rd21, %rd20, %rd5; - mul.hi.s64 %rd22, %rd21, 5270498306774157605; - shr.u64 %rd23, %rd22, 63; - shr.s64 %rd24, %rd22, 2; 
- add.s64 %rd25, %rd24, %rd23; - xor.b64 %rd26, %rd25, %rd20; - mul.hi.s64 %rd27, %rd26, 5270498306774157605; - shr.u64 %rd28, %rd27, 63; - shr.s64 %rd29, %rd27, 2; - add.s64 %rd30, %rd29, %rd28; - mul.lo.s64 %rd31, %rd30, 14; - sub.s64 %rd32, %rd26, %rd31; - setp.lt.s64 %p3, %rd32, 0; - add.s64 %rd33, %rd32, 14; - selp.b64 %rd34, %rd33, %rd32, %p3; - shr.s64 %rd35, %rd26, 63; - xor.b64 %rd36, %rd35, %rd26; - mul.hi.s64 %rd37, %rd36, 5270498306774157605; - shr.u64 %rd38, %rd37, 63; - shr.s64 %rd39, %rd37, 2; - add.s64 %rd40, %rd39, %rd38; - xor.b64 %rd41, %rd40, %rd35; - mul.lo.s64 %rd42, %rd41, 196; - mul.lo.s64 %rd43, %rd34, 14; - add.s64 %rd44, %rd43, %rd19; - add.s64 %rd45, %rd44, %rd42; - shl.b64 %rd46, %rd45, 1; - add.s64 %rd47, %rd2, %rd46; - ld.global.b16 %h1, [%rd47]; - add.s64 %rd48, %rd4, %rd46; - ld.global.b16 %h2, [%rd48]; - add.s64 %rd49, %rd3, %rd46; - ld.global.b16 %h3, [%rd49]; - add.rn.f16 %h4, %h2, %h3; - mov.b16 %h5, 0x0000; - setp.gt.f16 %p4, %h1, %h5; - selp.b16 %h6, %h4, 0x0000, %p4; - add.s64 %rd50, %rd1, %rd46; - st.global.b16 [%rd50], %h6; -$L__BB36_2: - ret; - -} - // .globl Unknown12 -.visible .entry Unknown12( - .param .u64 Unknown12_param_0, - .param .u64 Unknown12_param_1, - .param .u64 Unknown12_param_2, - .param .u64 Unknown12_param_3, - .param .u64 Unknown12_param_4, - .param .u64 Unknown12_param_5, - .param .u64 Unknown12_param_6, - .param .u64 Unknown12_param_7, - .param .u64 Unknown12_param_8, - .param .u64 Unknown12_param_9, - .param .u64 Unknown12_param_10, - .param .u64 Unknown12_param_11, - .param .u64 Unknown12_param_12, - .param .u64 Unknown12_param_13, - .param .u64 Unknown12_param_14, - .param .u64 Unknown12_param_15, - .param .u64 Unknown12_param_16, - .param .u64 Unknown12_param_17, - .param .u64 Unknown12_param_18, - .param .u64 Unknown12_param_19, - .param .u64 Unknown12_param_20, - .param .u64 Unknown12_param_21, - .param .u64 Unknown12_param_22, - .param .u64 Unknown12_param_23, - .param .u64 Unknown12_param_24, - .param .u64 Unknown12_param_25, - .param .u64 Unknown12_param_26, - .param .u64 Unknown12_param_27, - .param .u64 Unknown12_param_28, - .param .u64 Unknown12_param_29, - .param .u64 Unknown12_param_30, - .param .u64 Unknown12_param_31, - .param .u64 Unknown12_param_32 -) -{ - .reg .pred %p<5>; - .reg .b16 %h<5>; - .reg .b32 %r<4>; - .reg .b64 %rd<48>; + .reg .pred %p<4>; + .reg .b16 %rs<7>; + .reg .b32 %r<5>; + .reg .b64 %rd<25>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 25087; - @%p1 bra $L__BB37_2; - ld.param.u64 %rd5, [Unknown12_param_23]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown12_param_1]; - ld.param.u64 %rd7, [Unknown12_param_12]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - mul.hi.s64 %rd10, %rd4, 5270498306774157605; - shr.u64 %rd11, %rd10, 63; - shr.s64 %rd12, %rd10, 1; - add.s64 %rd13, %rd12, %rd11; - mul.lo.s64 %rd14, %rd13, 7; - sub.s64 %rd15, %rd4, %rd14; - setp.lt.s64 %p2, %rd15, 0; - add.s64 %rd16, %rd15, 7; - selp.b64 %rd17, %rd16, %rd15, %p2; - shr.s64 %rd18, %rd4, 63; - xor.b64 %rd19, %rd18, %rd4; - mul.hi.s64 %rd20, %rd19, 5270498306774157605; - shr.u64 %rd21, %rd20, 63; - shr.s64 %rd22, %rd20, 1; - add.s64 %rd23, %rd22, %rd21; - xor.b64 %rd24, %rd23, %rd18; - mul.hi.s64 %rd25, %rd24, 5270498306774157605; - shr.u64 %rd26, %rd25, 63; - shr.s64 %rd27, %rd25, 1; - add.s64 %rd28, %rd27, %rd26; - mul.lo.s64 %rd29, %rd28, 7; - sub.s64 %rd30, 
%rd24, %rd29; - setp.lt.s64 %p3, %rd30, 0; - add.s64 %rd31, %rd30, 7; - selp.b64 %rd32, %rd31, %rd30, %p3; - shr.s64 %rd33, %rd24, 63; - xor.b64 %rd34, %rd33, %rd24; - mul.hi.s64 %rd35, %rd34, 5270498306774157605; - shr.u64 %rd36, %rd35, 63; - shr.u64 %rd37, %rd35, 1; - add.s64 %rd38, %rd37, %rd36; - xor.b64 %rd39, %rd38, %rd33; - mul.lo.s64 %rd40, %rd39, 49; - mul.lo.s64 %rd41, %rd32, 7; - add.s64 %rd42, %rd41, %rd17; - add.s64 %rd43, %rd42, %rd40; - shl.b64 %rd44, %rd43, 1; - add.s64 %rd45, %rd3, %rd44; - ld.global.b16 %h1, [%rd45]; - add.s64 %rd46, %rd2, %rd44; - ld.global.b16 %h2, [%rd46]; - mov.b16 %h3, 0x0000; - setp.gt.f16 %p4, %h1, %h3; - selp.b16 %h4, %h2, 0x0000, %p4; - add.s64 %rd47, %rd1, %rd44; - st.global.b16 [%rd47], %h4; -$L__BB37_2: + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd24, %rd18, %rd17; + setp.gt.s64 %p1, %rd24, 50175; + @%p1 bra $L__BB20_3; + ld.param.u64 %rd13, [Unknown19_param_34]; + cvta.to.global.u64 %rd1, %rd13; + ld.param.u64 %rd14, [Unknown19_param_1]; + ld.param.u64 %rd15, [Unknown19_param_23]; + cvta.to.global.u64 %rd2, %rd15; + ld.param.u64 %rd16, [Unknown19_param_12]; + cvta.to.global.u64 %rd3, %rd16; + cvta.to.global.u64 %rd4, %rd14; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd6, %r2, %r4; + shl.b64 %rd23, %rd24, 1; + shl.b64 %rd8, %rd6, 1; +$L__BB20_2: + add.s64 %rd19, %rd4, %rd23; + ld.global.nc.u16 %rs1, [%rd19]; + add.s64 %rd20, %rd3, %rd23; + ld.global.nc.u16 %rs2, [%rd20]; + add.s64 %rd21, %rd2, %rd23; + ld.global.nc.u16 %rs3, [%rd21]; + add.rn.f16 %rs4, %rs1, %rs2; + mov.b16 %rs5, 0x0000; + setp.gt.f16 %p2, %rs3, %rs5; + selp.b16 %rs6, %rs4, 0x0000, %p2; + add.s64 %rd22, %rd1, %rd23; + st.global.b16 [%rd22], %rs6; + add.s64 %rd24, %rd24, %rd6; + add.s64 %rd23, %rd23, %rd8; + setp.lt.s64 %p3, %rd24, 50176; + @%p3 bra $L__BB20_2; +$L__BB20_3: ret; } @@ -3860,77 +1554,49 @@ $L__BB37_2: .param .u64 Unknown8_param_43 ) { - .reg .pred %p<5>; - .reg .b16 %h<7>; - .reg .b32 %r<4>; - .reg .b64 %rd<51>; + .reg .pred %p<4>; + .reg .b16 %rs<7>; + .reg .b32 %r<5>; + .reg .b64 %rd<25>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd10, %r3; - mul.wide.s32 %rd11, %r2, %r1; - add.s64 %rd5, %rd11, %rd10; - setp.gt.s64 %p1, %rd5, 25087; - @%p1 bra $L__BB38_2; - ld.param.u64 %rd6, [Unknown8_param_34]; - cvta.to.global.u64 %rd1, %rd6; - ld.param.u64 %rd7, [Unknown8_param_1]; - ld.param.u64 %rd8, [Unknown8_param_23]; - cvta.to.global.u64 %rd2, %rd8; - ld.param.u64 %rd9, [Unknown8_param_12]; - cvta.to.global.u64 %rd3, %rd9; - cvta.to.global.u64 %rd4, %rd7; - mul.hi.s64 %rd12, %rd5, 5270498306774157605; - shr.u64 %rd13, %rd12, 63; - shr.s64 %rd14, %rd12, 1; - add.s64 %rd15, %rd14, %rd13; - mul.lo.s64 %rd16, %rd15, 7; - sub.s64 %rd17, %rd5, %rd16; - setp.lt.s64 %p2, %rd17, 0; - add.s64 %rd18, %rd17, 7; - selp.b64 %rd19, %rd18, %rd17, %p2; - shr.s64 %rd20, %rd5, 63; - xor.b64 %rd21, %rd20, %rd5; - mul.hi.s64 %rd22, %rd21, 5270498306774157605; - shr.u64 %rd23, %rd22, 63; - shr.s64 %rd24, %rd22, 1; - add.s64 %rd25, %rd24, %rd23; - xor.b64 %rd26, %rd25, %rd20; - mul.hi.s64 %rd27, %rd26, 5270498306774157605; - shr.u64 %rd28, %rd27, 63; - shr.s64 %rd29, %rd27, 1; - add.s64 %rd30, %rd29, %rd28; - mul.lo.s64 %rd31, %rd30, 7; - sub.s64 %rd32, %rd26, %rd31; - setp.lt.s64 %p3, %rd32, 0; - add.s64 %rd33, %rd32, 7; - selp.b64 %rd34, %rd33, %rd32, %p3; - shr.s64 %rd35, %rd26, 63; - xor.b64 %rd36, %rd35, %rd26; - mul.hi.s64 %rd37, %rd36, 5270498306774157605; - shr.u64 %rd38, %rd37, 63; - shr.u64 %rd39, %rd37, 1; - add.s64 
%rd40, %rd39, %rd38; - xor.b64 %rd41, %rd40, %rd35; - mul.lo.s64 %rd42, %rd41, 49; - mul.lo.s64 %rd43, %rd34, 7; - add.s64 %rd44, %rd43, %rd19; - add.s64 %rd45, %rd44, %rd42; - shl.b64 %rd46, %rd45, 1; - add.s64 %rd47, %rd2, %rd46; - ld.global.b16 %h1, [%rd47]; - add.s64 %rd48, %rd4, %rd46; - ld.global.b16 %h2, [%rd48]; - add.s64 %rd49, %rd3, %rd46; - ld.global.b16 %h3, [%rd49]; - add.rn.f16 %h4, %h2, %h3; - mov.b16 %h5, 0x0000; - setp.gt.f16 %p4, %h1, %h5; - selp.b16 %h6, %h4, 0x0000, %p4; - add.s64 %rd50, %rd1, %rd46; - st.global.b16 [%rd50], %h6; -$L__BB38_2: + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd24, %rd18, %rd17; + setp.gt.s64 %p1, %rd24, 25087; + @%p1 bra $L__BB21_3; + ld.param.u64 %rd13, [Unknown8_param_34]; + cvta.to.global.u64 %rd1, %rd13; + ld.param.u64 %rd14, [Unknown8_param_1]; + ld.param.u64 %rd15, [Unknown8_param_23]; + cvta.to.global.u64 %rd2, %rd15; + ld.param.u64 %rd16, [Unknown8_param_12]; + cvta.to.global.u64 %rd3, %rd16; + cvta.to.global.u64 %rd4, %rd14; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd6, %r2, %r4; + shl.b64 %rd23, %rd24, 1; + shl.b64 %rd8, %rd6, 1; +$L__BB21_2: + add.s64 %rd19, %rd4, %rd23; + ld.global.nc.u16 %rs1, [%rd19]; + add.s64 %rd20, %rd3, %rd23; + ld.global.nc.u16 %rs2, [%rd20]; + add.s64 %rd21, %rd2, %rd23; + ld.global.nc.u16 %rs3, [%rd21]; + add.rn.f16 %rs4, %rs1, %rs2; + mov.b16 %rs5, 0x0000; + setp.gt.f16 %p2, %rs3, %rs5; + selp.b16 %rs6, %rs4, 0x0000, %p2; + add.s64 %rd22, %rd1, %rd23; + st.global.b16 [%rd22], %rs6; + add.s64 %rd24, %rd24, %rd6; + add.s64 %rd23, %rd23, %rd8; + setp.lt.s64 %p3, %rd24, 25088; + @%p3 bra $L__BB21_2; +$L__BB21_3: ret; } @@ -3971,72 +1637,44 @@ $L__BB38_2: .param .u64 Unknown4_param_32 ) { - .reg .pred %p<5>; - .reg .b16 %h<5>; - .reg .b32 %r<4>; - .reg .b64 %rd<48>; + .reg .pred %p<4>; + .reg .b16 %rs<5>; + .reg .b32 %r<5>; + .reg .b64 %rd<22>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 25087; - @%p1 bra $L__BB39_2; - ld.param.u64 %rd5, [Unknown4_param_23]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown4_param_1]; - ld.param.u64 %rd7, [Unknown4_param_12]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - mul.hi.s64 %rd10, %rd4, 5270498306774157605; - shr.u64 %rd11, %rd10, 63; - shr.s64 %rd12, %rd10, 1; - add.s64 %rd13, %rd12, %rd11; - mul.lo.s64 %rd14, %rd13, 7; - sub.s64 %rd15, %rd4, %rd14; - setp.lt.s64 %p2, %rd15, 0; - add.s64 %rd16, %rd15, 7; - selp.b64 %rd17, %rd16, %rd15, %p2; - shr.s64 %rd18, %rd4, 63; - xor.b64 %rd19, %rd18, %rd4; - mul.hi.s64 %rd20, %rd19, 5270498306774157605; - shr.u64 %rd21, %rd20, 63; - shr.s64 %rd22, %rd20, 1; - add.s64 %rd23, %rd22, %rd21; - xor.b64 %rd24, %rd23, %rd18; - mul.hi.s64 %rd25, %rd24, 5270498306774157605; - shr.u64 %rd26, %rd25, 63; - shr.s64 %rd27, %rd25, 1; - add.s64 %rd28, %rd27, %rd26; - mul.lo.s64 %rd29, %rd28, 7; - sub.s64 %rd30, %rd24, %rd29; - setp.lt.s64 %p3, %rd30, 0; - add.s64 %rd31, %rd30, 7; - selp.b64 %rd32, %rd31, %rd30, %p3; - shr.s64 %rd33, %rd24, 63; - xor.b64 %rd34, %rd33, %rd24; - mul.hi.s64 %rd35, %rd34, 5270498306774157605; - shr.u64 %rd36, %rd35, 63; - shr.u64 %rd37, %rd35, 1; - add.s64 %rd38, %rd37, %rd36; - xor.b64 %rd39, %rd38, %rd33; - mul.lo.s64 %rd40, %rd39, 49; - mul.lo.s64 %rd41, %rd32, 7; - add.s64 %rd42, %rd41, %rd17; - add.s64 %rd43, %rd42, %rd40; - shl.b64 %rd44, %rd43, 1; - add.s64 %rd45, %rd3, %rd44; - ld.global.b16 %h1, [%rd45]; - add.s64 
%rd46, %rd2, %rd44; - ld.global.b16 %h2, [%rd46]; - mov.b16 %h3, 0x0000; - setp.gt.f16 %p4, %h1, %h3; - selp.b16 %h4, %h2, 0x0000, %p4; - add.s64 %rd47, %rd1, %rd44; - st.global.b16 [%rd47], %h4; -$L__BB39_2: + cvt.s64.s32 %rd15, %r3; + mul.wide.s32 %rd16, %r2, %r1; + add.s64 %rd21, %rd16, %rd15; + setp.gt.s64 %p1, %rd21, 25087; + @%p1 bra $L__BB22_3; + ld.param.u64 %rd12, [Unknown4_param_23]; + cvta.to.global.u64 %rd1, %rd12; + ld.param.u64 %rd13, [Unknown4_param_1]; + ld.param.u64 %rd14, [Unknown4_param_12]; + cvta.to.global.u64 %rd2, %rd14; + cvta.to.global.u64 %rd3, %rd13; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd5, %r2, %r4; + shl.b64 %rd20, %rd21, 1; + shl.b64 %rd7, %rd5, 1; +$L__BB22_2: + add.s64 %rd17, %rd3, %rd20; + ld.global.nc.u16 %rs1, [%rd17]; + add.s64 %rd18, %rd2, %rd20; + ld.global.nc.u16 %rs2, [%rd18]; + mov.b16 %rs3, 0x0000; + setp.gt.f16 %p2, %rs1, %rs3; + selp.b16 %rs4, %rs2, 0x0000, %p2; + add.s64 %rd19, %rd1, %rd20; + st.global.b16 [%rd19], %rs4; + add.s64 %rd21, %rd21, %rd5; + add.s64 %rd20, %rd20, %rd7; + setp.lt.s64 %p3, %rd21, 25088; + @%p3 bra $L__BB22_2; +$L__BB22_3: ret; } @@ -4073,77 +1711,53 @@ $L__BB39_2: .param .u64 Unknown0_param_28 ) { - .reg .pred %p<5>; - .reg .b16 %h<6>; - .reg .b32 %r<4>; + .reg .pred %p<4>; + .reg .b16 %rs<6>; + .reg .b32 %r<5>; .reg .f32 %f<3>; - .reg .b64 %rd<49>; + .reg .b64 %rd<27>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 25087; - @%p1 bra $L__BB40_2; - ld.param.u64 %rd5, [Unknown0_param_19]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown0_param_1]; - ld.param.u64 %rd7, [Unknown0_param_8]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - mul.hi.s64 %rd10, %rd4, 5270498306774157605; - shr.u64 %rd11, %rd10, 63; - shr.s64 %rd12, %rd10, 1; - add.s64 %rd13, %rd12, %rd11; - mul.lo.s64 %rd14, %rd13, 7; - sub.s64 %rd15, %rd4, %rd14; - setp.lt.s64 %p2, %rd15, 0; - add.s64 %rd16, %rd15, 7; - selp.b64 %rd17, %rd16, %rd15, %p2; - shr.s64 %rd18, %rd4, 63; - xor.b64 %rd19, %rd18, %rd4; - mul.hi.s64 %rd20, %rd19, 5270498306774157605; - shr.u64 %rd21, %rd20, 63; - shr.s64 %rd22, %rd20, 1; - add.s64 %rd23, %rd22, %rd21; - xor.b64 %rd24, %rd23, %rd18; - mul.hi.s64 %rd25, %rd24, 5270498306774157605; - shr.u64 %rd26, %rd25, 63; - shr.s64 %rd27, %rd25, 1; - add.s64 %rd28, %rd27, %rd26; - mul.lo.s64 %rd29, %rd28, 7; - sub.s64 %rd30, %rd24, %rd29; - setp.lt.s64 %p3, %rd30, 0; - add.s64 %rd31, %rd30, 7; - selp.b64 %rd32, %rd31, %rd30, %p3; - shr.s64 %rd33, %rd24, 63; - xor.b64 %rd34, %rd33, %rd24; - mul.hi.s64 %rd35, %rd34, 5270498306774157605; - shr.u64 %rd36, %rd35, 63; - shr.s64 %rd37, %rd35, 1; - add.s64 %rd38, %rd37, %rd36; - xor.b64 %rd39, %rd38, %rd33; - mul.lo.s64 %rd40, %rd39, 49; - mul.lo.s64 %rd41, %rd32, 7; - add.s64 %rd42, %rd41, %rd17; - add.s64 %rd43, %rd42, %rd40; - shl.b64 %rd44, %rd43, 1; - add.s64 %rd45, %rd2, %rd44; - ld.global.b16 %h1, [%rd45]; - shl.b64 %rd46, %rd39, 1; - add.s64 %rd47, %rd3, %rd46; - ld.global.b16 %h2, [%rd47]; - cvt.f32.f16 %f1, %h2; + cvt.s64.s32 %rd15, %r3; + mul.wide.s32 %rd16, %r2, %r1; + add.s64 %rd26, %rd16, %rd15; + setp.gt.s64 %p1, %rd26, 25087; + @%p1 bra $L__BB23_3; + ld.param.u64 %rd12, [Unknown0_param_19]; + cvta.to.global.u64 %rd1, %rd12; + ld.param.u64 %rd13, [Unknown0_param_1]; + ld.param.u64 %rd14, [Unknown0_param_8]; + cvta.to.global.u64 %rd2, %rd14; + cvta.to.global.u64 %rd3, %rd13; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 
%rd5, %r2, %r4; + shl.b64 %rd25, %rd26, 1; + shl.b64 %rd7, %rd5, 1; +$L__BB23_2: + mul.hi.s64 %rd17, %rd26, 6023426636313322977; + shr.u64 %rd18, %rd17, 63; + shr.s64 %rd19, %rd17, 4; + add.s64 %rd20, %rd19, %rd18; + shl.b64 %rd21, %rd20, 1; + add.s64 %rd22, %rd3, %rd21; + ld.global.nc.u16 %rs1, [%rd22]; + add.s64 %rd23, %rd2, %rd25; + ld.global.nc.u16 %rs2, [%rd23]; + cvt.f32.f16 %f1, %rs1; div.rn.f32 %f2, %f1, 0f42440000; - cvt.rn.f16.f32 %h3, %f2; - mov.b16 %h4, 0x0000; - setp.gt.f16 %p4, %h1, %h4; - selp.b16 %h5, %h3, 0x0000, %p4; - add.s64 %rd48, %rd1, %rd44; - st.global.b16 [%rd48], %h5; -$L__BB40_2: + cvt.rn.f16.f32 %rs3, %f2; + mov.b16 %rs4, 0x0000; + setp.gt.f16 %p2, %rs2, %rs4; + selp.b16 %rs5, %rs3, 0x0000, %p2; + add.s64 %rd24, %rd1, %rd25; + st.global.b16 [%rd24], %rs5; + add.s64 %rd26, %rd26, %rd5; + add.s64 %rd25, %rd25, %rd7; + setp.lt.s64 %p3, %rd26, 25088; + @%p3 bra $L__BB23_2; +$L__BB23_3: ret; } diff --git a/compiler/test/E2E/ResNet18/BW/host_output.mlir b/compiler/test/E2E/ResNet18/BW/host_output.mlir index 0ad518caf..392748525 100644 --- a/compiler/test/E2E/ResNet18/BW/host_output.mlir +++ b/compiler/test/E2E/ResNet18/BW/host_output.mlir @@ -5,161 +5,157 @@ module attributes {byre.container_module, gpu.container_module} { func.func @main(%arg0: memref<64xf32, "cuda"> {byre.argname = "Input0", byre.argtype = 1 : i32}, %arg1: memref<64xf32, "cuda"> {byre.argname = "Input1", byre.argtype = 1 : i32}, %arg2: memref<64xf32, "cuda"> {byre.argname = "Input2", byre.argtype = 1 : i32}, %arg3: memref<64xf32, "cuda"> {byre.argname = "Input3", byre.argtype = 1 : i32}, %arg4: memref<64xf32, "cuda"> {byre.argname = "Input4", byre.argtype = 1 : i32}, %arg5: memref<64xf32, "cuda"> {byre.argname = "Input5", byre.argtype = 1 : i32}, %arg6: memref<64xf32, "cuda"> {byre.argname = "Input6", byre.argtype = 1 : i32}, %arg7: memref<64xf32, "cuda"> {byre.argname = "Input7", byre.argtype = 1 : i32}, %arg8: memref<64xf32, "cuda"> {byre.argname = "Input8", byre.argtype = 1 : i32}, %arg9: memref<64xf32, "cuda"> {byre.argname = "Input9", byre.argtype = 1 : i32}, %arg10: memref<128xf32, "cuda"> {byre.argname = "Input10", byre.argtype = 1 : i32}, %arg11: memref<128xf32, "cuda"> {byre.argname = "Input11", byre.argtype = 1 : i32}, %arg12: memref<128xf32, "cuda"> {byre.argname = "Input12", byre.argtype = 1 : i32}, %arg13: memref<128xf32, "cuda"> {byre.argname = "Input13", byre.argtype = 1 : i32}, %arg14: memref<128xf32, "cuda"> {byre.argname = "Input14", byre.argtype = 1 : i32}, %arg15: memref<128xf32, "cuda"> {byre.argname = "Input15", byre.argtype = 1 : i32}, %arg16: memref<128xf32, "cuda"> {byre.argname = "Input16", byre.argtype = 1 : i32}, %arg17: memref<128xf32, "cuda"> {byre.argname = "Input17", byre.argtype = 1 : i32}, %arg18: memref<128xf32, "cuda"> {byre.argname = "Input18", byre.argtype = 1 : i32}, %arg19: memref<128xf32, "cuda"> {byre.argname = "Input19", byre.argtype = 1 : i32}, %arg20: memref<256xf32, "cuda"> {byre.argname = "Input20", byre.argtype = 1 : i32}, %arg21: memref<256xf32, "cuda"> {byre.argname = "Input21", byre.argtype = 1 : i32}, %arg22: memref<256xf32, "cuda"> {byre.argname = "Input22", byre.argtype = 1 : i32}, %arg23: memref<256xf32, "cuda"> {byre.argname = "Input23", byre.argtype = 1 : i32}, %arg24: memref<256xf32, "cuda"> {byre.argname = "Input24", byre.argtype = 1 : i32}, %arg25: memref<256xf32, "cuda"> {byre.argname = "Input25", byre.argtype = 1 : i32}, %arg26: memref<256xf32, "cuda"> {byre.argname = "Input26", byre.argtype = 1 : i32}, %arg27: memref<256xf32, "cuda"> 
{byre.argname = "Input27", byre.argtype = 1 : i32}, %arg28: memref<256xf32, "cuda"> {byre.argname = "Input28", byre.argtype = 1 : i32}, %arg29: memref<256xf32, "cuda"> {byre.argname = "Input29", byre.argtype = 1 : i32}, %arg30: memref<512xf32, "cuda"> {byre.argname = "Input30", byre.argtype = 1 : i32}, %arg31: memref<512xf32, "cuda"> {byre.argname = "Input31", byre.argtype = 1 : i32}, %arg32: memref<512xf32, "cuda"> {byre.argname = "Input32", byre.argtype = 1 : i32}, %arg33: memref<512xf32, "cuda"> {byre.argname = "Input33", byre.argtype = 1 : i32}, %arg34: memref<512xf32, "cuda"> {byre.argname = "Input34", byre.argtype = 1 : i32}, %arg35: memref<512xf32, "cuda"> {byre.argname = "Input35", byre.argtype = 1 : i32}, %arg36: memref<512xf32, "cuda"> {byre.argname = "Input36", byre.argtype = 1 : i32}, %arg37: memref<512xf32, "cuda"> {byre.argname = "Input37", byre.argtype = 1 : i32}, %arg38: memref<512xf32, "cuda"> {byre.argname = "Input38", byre.argtype = 1 : i32}, %arg39: memref<512xf32, "cuda"> {byre.argname = "Input39", byre.argtype = 1 : i32}, %arg40: memref<64xf32, "cuda"> {byre.argname = "Input40", byre.argtype = 1 : i32}, %arg41: memref<64xf32, "cuda"> {byre.argname = "Input41", byre.argtype = 1 : i32}, %arg42: memref<64xf32, "cuda"> {byre.argname = "Input42", byre.argtype = 1 : i32}, %arg43: memref<64xf32, "cuda"> {byre.argname = "Input43", byre.argtype = 1 : i32}, %arg44: memref<64xf32, "cuda"> {byre.argname = "Input44", byre.argtype = 1 : i32}, %arg45: memref<64xf32, "cuda"> {byre.argname = "Input45", byre.argtype = 1 : i32}, %arg46: memref<64xf32, "cuda"> {byre.argname = "Input46", byre.argtype = 1 : i32}, %arg47: memref<64xf32, "cuda"> {byre.argname = "Input47", byre.argtype = 1 : i32}, %arg48: memref<64xf32, "cuda"> {byre.argname = "Input48", byre.argtype = 1 : i32}, %arg49: memref<64xf32, "cuda"> {byre.argname = "Input49", byre.argtype = 1 : i32}, %arg50: memref<128xf32, "cuda"> {byre.argname = "Input50", byre.argtype = 1 : i32}, %arg51: memref<128xf32, "cuda"> {byre.argname = "Input51", byre.argtype = 1 : i32}, %arg52: memref<128xf32, "cuda"> {byre.argname = "Input52", byre.argtype = 1 : i32}, %arg53: memref<128xf32, "cuda"> {byre.argname = "Input53", byre.argtype = 1 : i32}, %arg54: memref<128xf32, "cuda"> {byre.argname = "Input54", byre.argtype = 1 : i32}, %arg55: memref<128xf32, "cuda"> {byre.argname = "Input55", byre.argtype = 1 : i32}, %arg56: memref<128xf32, "cuda"> {byre.argname = "Input56", byre.argtype = 1 : i32}, %arg57: memref<128xf32, "cuda"> {byre.argname = "Input57", byre.argtype = 1 : i32}, %arg58: memref<128xf32, "cuda"> {byre.argname = "Input58", byre.argtype = 1 : i32}, %arg59: memref<128xf32, "cuda"> {byre.argname = "Input59", byre.argtype = 1 : i32}, %arg60: memref<256xf32, "cuda"> {byre.argname = "Input60", byre.argtype = 1 : i32}, %arg61: memref<256xf32, "cuda"> {byre.argname = "Input61", byre.argtype = 1 : i32}, %arg62: memref<256xf32, "cuda"> {byre.argname = "Input62", byre.argtype = 1 : i32}, %arg63: memref<256xf32, "cuda"> {byre.argname = "Input63", byre.argtype = 1 : i32}, %arg64: memref<256xf32, "cuda"> {byre.argname = "Input64", byre.argtype = 1 : i32}, %arg65: memref<256xf32, "cuda"> {byre.argname = "Input65", byre.argtype = 1 : i32}, %arg66: memref<256xf32, "cuda"> {byre.argname = "Input66", byre.argtype = 1 : i32}, %arg67: memref<256xf32, "cuda"> {byre.argname = "Input67", byre.argtype = 1 : i32}, %arg68: memref<256xf32, "cuda"> {byre.argname = "Input68", byre.argtype = 1 : i32}, %arg69: memref<256xf32, "cuda"> {byre.argname = "Input69", 
byre.argtype = 1 : i32}, %arg70: memref<512xf32, "cuda"> {byre.argname = "Input70", byre.argtype = 1 : i32}, %arg71: memref<512xf32, "cuda"> {byre.argname = "Input71", byre.argtype = 1 : i32}, %arg72: memref<512xf32, "cuda"> {byre.argname = "Input72", byre.argtype = 1 : i32}, %arg73: memref<512xf32, "cuda"> {byre.argname = "Input73", byre.argtype = 1 : i32}, %arg74: memref<512xf32, "cuda"> {byre.argname = "Input74", byre.argtype = 1 : i32}, %arg75: memref<512xf32, "cuda"> {byre.argname = "Input75", byre.argtype = 1 : i32}, %arg76: memref<512xf32, "cuda"> {byre.argname = "Input76", byre.argtype = 1 : i32}, %arg77: memref<512xf32, "cuda"> {byre.argname = "Input77", byre.argtype = 1 : i32}, %arg78: memref<512xf32, "cuda"> {byre.argname = "Input78", byre.argtype = 1 : i32}, %arg79: memref<512xf32, "cuda"> {byre.argname = "Input79", byre.argtype = 1 : i32}, %arg80: memref<64x3x7x7xf16, "cuda"> {byre.argname = "Input80", byre.argtype = 1 : i32}, %arg81: memref<1x3x224x224xf16, "cuda"> {byre.argname = "Input81", byre.argtype = 1 : i32}, %arg82: memref<1x64x112x112xf16, "cuda"> {byre.argname = "Input82", byre.argtype = 1 : i32}, %arg83: memref<1x64x112x112xf16, "cuda"> {byre.argname = "Input83", byre.argtype = 1 : i32}, %arg84: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input84", byre.argtype = 1 : i32}, %arg85: memref<64x64x3x3xf16, "cuda"> {byre.argname = "Input85", byre.argtype = 1 : i32}, %arg86: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input86", byre.argtype = 1 : i32}, %arg87: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input87", byre.argtype = 1 : i32}, %arg88: memref<64x64x3x3xf16, "cuda"> {byre.argname = "Input88", byre.argtype = 1 : i32}, %arg89: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input89", byre.argtype = 1 : i32}, %arg90: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input90", byre.argtype = 1 : i32}, %arg91: memref<64x64x3x3xf16, "cuda"> {byre.argname = "Input91", byre.argtype = 1 : i32}, %arg92: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input92", byre.argtype = 1 : i32}, %arg93: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input93", byre.argtype = 1 : i32}, %arg94: memref<64x64x3x3xf16, "cuda"> {byre.argname = "Input94", byre.argtype = 1 : i32}, %arg95: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input95", byre.argtype = 1 : i32}, %arg96: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Input96", byre.argtype = 1 : i32}, %arg97: memref<128x64x3x3xf16, "cuda"> {byre.argname = "Input97", byre.argtype = 1 : i32}, %arg98: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input98", byre.argtype = 1 : i32}, %arg99: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input99", byre.argtype = 1 : i32}, %arg100: memref<128x128x3x3xf16, "cuda"> {byre.argname = "Input100", byre.argtype = 1 : i32}, %arg101: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input101", byre.argtype = 1 : i32}, %arg102: memref<128x64x1x1xf16, "cuda"> {byre.argname = "Input102", byre.argtype = 1 : i32}, %arg103: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input103", byre.argtype = 1 : i32}, %arg104: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input104", byre.argtype = 1 : i32}, %arg105: memref<128x128x3x3xf16, "cuda"> {byre.argname = "Input105", byre.argtype = 1 : i32}, %arg106: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input106", byre.argtype = 1 : i32}, %arg107: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input107", byre.argtype = 1 : i32}, %arg108: memref<128x128x3x3xf16, "cuda"> {byre.argname = "Input108", byre.argtype = 1 : i32}, %arg109: 
memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input109", byre.argtype = 1 : i32}, %arg110: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Input110", byre.argtype = 1 : i32}, %arg111: memref<256x128x3x3xf16, "cuda"> {byre.argname = "Input111", byre.argtype = 1 : i32}, %arg112: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input112", byre.argtype = 1 : i32}, %arg113: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input113", byre.argtype = 1 : i32}, %arg114: memref<256x256x3x3xf16, "cuda"> {byre.argname = "Input114", byre.argtype = 1 : i32}, %arg115: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input115", byre.argtype = 1 : i32}, %arg116: memref<256x128x1x1xf16, "cuda"> {byre.argname = "Input116", byre.argtype = 1 : i32}, %arg117: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input117", byre.argtype = 1 : i32}, %arg118: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input118", byre.argtype = 1 : i32}, %arg119: memref<256x256x3x3xf16, "cuda"> {byre.argname = "Input119", byre.argtype = 1 : i32}, %arg120: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input120", byre.argtype = 1 : i32}, %arg121: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input121", byre.argtype = 1 : i32}, %arg122: memref<256x256x3x3xf16, "cuda"> {byre.argname = "Input122", byre.argtype = 1 : i32}, %arg123: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input123", byre.argtype = 1 : i32}, %arg124: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Input124", byre.argtype = 1 : i32}, %arg125: memref<512x256x3x3xf16, "cuda"> {byre.argname = "Input125", byre.argtype = 1 : i32}, %arg126: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input126", byre.argtype = 1 : i32}, %arg127: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input127", byre.argtype = 1 : i32}, %arg128: memref<512x512x3x3xf16, "cuda"> {byre.argname = "Input128", byre.argtype = 1 : i32}, %arg129: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input129", byre.argtype = 1 : i32}, %arg130: memref<512x256x1x1xf16, "cuda"> {byre.argname = "Input130", byre.argtype = 1 : i32}, %arg131: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input131", byre.argtype = 1 : i32}, %arg132: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input132", byre.argtype = 1 : i32}, %arg133: memref<512x512x3x3xf16, "cuda"> {byre.argname = "Input133", byre.argtype = 1 : i32}, %arg134: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input134", byre.argtype = 1 : i32}, %arg135: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input135", byre.argtype = 1 : i32}, %arg136: memref<512x512x3x3xf16, "cuda"> {byre.argname = "Input136", byre.argtype = 1 : i32}, %arg137: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input137", byre.argtype = 1 : i32}, %arg138: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Input138", byre.argtype = 1 : i32}, %arg139: memref<1x512xf16, "cuda"> {byre.argname = "Input139", byre.argtype = 1 : i32}, %arg140: memref<512x1000xf16, "cuda"> {byre.argname = "Input140", byre.argtype = 1 : i32}, %arg141: memref<1x1000xf16, "cuda"> {byre.argname = "Input141", byre.argtype = 1 : i32}, %arg142: memref<64xf32, "cuda"> {byre.argname = "Output0", byre.argtype = 2 : i32}, %arg143: memref<64xf32, "cuda"> {byre.argname = "Output1", byre.argtype = 2 : i32}, %arg144: memref<64x3x7x7xf32, "cuda"> {byre.argname = "Output2", byre.argtype = 2 : i32}, %arg145: memref<1000xf32, "cuda"> {byre.argname = "Output3", byre.argtype = 2 : i32}, %arg146: memref<1000x512xf32, "cuda"> {byre.argname = "Output4", byre.argtype = 2 : i32}, %arg147: memref<64xf32, "cuda"> {byre.argname = "Output5", 
byre.argtype = 2 : i32}, %arg148: memref<64xf32, "cuda"> {byre.argname = "Output6", byre.argtype = 2 : i32}, %arg149: memref<64xf32, "cuda"> {byre.argname = "Output7", byre.argtype = 2 : i32}, %arg150: memref<64xf32, "cuda"> {byre.argname = "Output8", byre.argtype = 2 : i32}, %arg151: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Output9", byre.argtype = 2 : i32}, %arg152: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Output10", byre.argtype = 2 : i32}, %arg153: memref<64xf32, "cuda"> {byre.argname = "Output11", byre.argtype = 2 : i32}, %arg154: memref<64xf32, "cuda"> {byre.argname = "Output12", byre.argtype = 2 : i32}, %arg155: memref<64xf32, "cuda"> {byre.argname = "Output13", byre.argtype = 2 : i32}, %arg156: memref<64xf32, "cuda"> {byre.argname = "Output14", byre.argtype = 2 : i32}, %arg157: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Output15", byre.argtype = 2 : i32}, %arg158: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Output16", byre.argtype = 2 : i32}, %arg159: memref<128xf32, "cuda"> {byre.argname = "Output17", byre.argtype = 2 : i32}, %arg160: memref<128xf32, "cuda"> {byre.argname = "Output18", byre.argtype = 2 : i32}, %arg161: memref<128xf32, "cuda"> {byre.argname = "Output19", byre.argtype = 2 : i32}, %arg162: memref<128xf32, "cuda"> {byre.argname = "Output20", byre.argtype = 2 : i32}, %arg163: memref<128x64x3x3xf32, "cuda"> {byre.argname = "Output21", byre.argtype = 2 : i32}, %arg164: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Output22", byre.argtype = 2 : i32}, %arg165: memref<128x64x1x1xf32, "cuda"> {byre.argname = "Output23", byre.argtype = 2 : i32}, %arg166: memref<128xf32, "cuda"> {byre.argname = "Output24", byre.argtype = 2 : i32}, %arg167: memref<128xf32, "cuda"> {byre.argname = "Output25", byre.argtype = 2 : i32}, %arg168: memref<128xf32, "cuda"> {byre.argname = "Output26", byre.argtype = 2 : i32}, %arg169: memref<128xf32, "cuda"> {byre.argname = "Output27", byre.argtype = 2 : i32}, %arg170: memref<128xf32, "cuda"> {byre.argname = "Output28", byre.argtype = 2 : i32}, %arg171: memref<128xf32, "cuda"> {byre.argname = "Output29", byre.argtype = 2 : i32}, %arg172: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Output30", byre.argtype = 2 : i32}, %arg173: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Output31", byre.argtype = 2 : i32}, %arg174: memref<256xf32, "cuda"> {byre.argname = "Output32", byre.argtype = 2 : i32}, %arg175: memref<256xf32, "cuda"> {byre.argname = "Output33", byre.argtype = 2 : i32}, %arg176: memref<256xf32, "cuda"> {byre.argname = "Output34", byre.argtype = 2 : i32}, %arg177: memref<256xf32, "cuda"> {byre.argname = "Output35", byre.argtype = 2 : i32}, %arg178: memref<256x128x3x3xf32, "cuda"> {byre.argname = "Output36", byre.argtype = 2 : i32}, %arg179: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Output37", byre.argtype = 2 : i32}, %arg180: memref<256x128x1x1xf32, "cuda"> {byre.argname = "Output38", byre.argtype = 2 : i32}, %arg181: memref<256xf32, "cuda"> {byre.argname = "Output39", byre.argtype = 2 : i32}, %arg182: memref<256xf32, "cuda"> {byre.argname = "Output40", byre.argtype = 2 : i32}, %arg183: memref<256xf32, "cuda"> {byre.argname = "Output41", byre.argtype = 2 : i32}, %arg184: memref<256xf32, "cuda"> {byre.argname = "Output42", byre.argtype = 2 : i32}, %arg185: memref<256xf32, "cuda"> {byre.argname = "Output43", byre.argtype = 2 : i32}, %arg186: memref<256xf32, "cuda"> {byre.argname = "Output44", byre.argtype = 2 : i32}, %arg187: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Output45", byre.argtype = 2 : i32}, 
%arg188: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Output46", byre.argtype = 2 : i32}, %arg189: memref<512xf32, "cuda"> {byre.argname = "Output47", byre.argtype = 2 : i32}, %arg190: memref<512xf32, "cuda"> {byre.argname = "Output48", byre.argtype = 2 : i32}, %arg191: memref<512xf32, "cuda"> {byre.argname = "Output49", byre.argtype = 2 : i32}, %arg192: memref<512xf32, "cuda"> {byre.argname = "Output50", byre.argtype = 2 : i32}, %arg193: memref<512x256x3x3xf32, "cuda"> {byre.argname = "Output51", byre.argtype = 2 : i32}, %arg194: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Output52", byre.argtype = 2 : i32}, %arg195: memref<512x256x1x1xf32, "cuda"> {byre.argname = "Output53", byre.argtype = 2 : i32}, %arg196: memref<512xf32, "cuda"> {byre.argname = "Output54", byre.argtype = 2 : i32}, %arg197: memref<512xf32, "cuda"> {byre.argname = "Output55", byre.argtype = 2 : i32}, %arg198: memref<512xf32, "cuda"> {byre.argname = "Output56", byre.argtype = 2 : i32}, %arg199: memref<512xf32, "cuda"> {byre.argname = "Output57", byre.argtype = 2 : i32}, %arg200: memref<512xf32, "cuda"> {byre.argname = "Output58", byre.argtype = 2 : i32}, %arg201: memref<512xf32, "cuda"> {byre.argname = "Output59", byre.argtype = 2 : i32}, %arg202: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Output60", byre.argtype = 2 : i32}, %arg203: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Output61", byre.argtype = 2 : i32}) attributes {byre.entry_point, device_file_name = "your_file"} { %alloc = memref.alloc() : memref<25927680xi8, "cuda"> - %0 = "byre.alias"(%alloc) {device = "cuda", offset = 1671168 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x512xf16, "cuda"> + %0 = "byre.alias"(%alloc) <{offset = 1671168 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x512xf16, "cuda"> byre.compute @MatmulOp_f16f16_f16(%arg141, %arg140, %0) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<1x1000xf16, "cuda">, memref<512x1000xf16, "cuda">, memref<1x512xf16, "cuda"> - %1 = "byre.alias"(%alloc) {device = "cuda", offset = 16490496 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> - byre.compute @PTXOp(%0, %arg138, %1) {BlockSize.x = 128 : i32, GridSize.x = 196 : i32, arg_ranks = [2 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown0", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %2 = "byre.alias"(%alloc) {device = "cuda", offset = 21209088 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + %1 = "byre.alias"(%alloc) <{offset = 16490496 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + byre.compute @PTXOp(%0, %arg138, %1) {BlockSize.x = 256 : i32, GridSize.x = 25 : i32, arg_ranks = [2 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown0", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> + %2 = "byre.alias"(%alloc) <{offset = 21209088 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg137, %arg39, %1, %2, %arg201, %arg200) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, 
memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - %3 = "byre.alias"(%alloc) {device = "cuda", offset = 16540672 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + %3 = "byre.alias"(%alloc) <{offset = 16540672 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%2, %arg136, %3) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %4 = "byre.alias"(%alloc) {device = "cuda", offset = 1671168 : i64} : (memref<25927680xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> + %4 = "byre.alias"(%alloc) <{offset = 1671168 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg135, %2, %4) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg135, %3, %2) {BlockSize.x = 128 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown4", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> + byre.compute @PTXOp(%arg135, %3, %2) {BlockSize.x = 256 : i32, GridSize.x = 25 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown4", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg134, %arg37, %2, %3, %arg199, %arg198) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - %5 = "byre.alias"(%alloc) {device = "cuda", offset = 14131200 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + %5 = "byre.alias"(%alloc) <{offset = 14131200 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%3, %arg133, %5) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %6 = "byre.alias"(%alloc) {device = "cuda", offset = 21209088 : i64} : (memref<25927680xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> + %6 = "byre.alias"(%alloc) <{offset = 21209088 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> 
byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg132, %3, %6) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> - %7 = "byre.alias"(%alloc) {device = "cuda", offset = 9740288 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> - byre.compute @PTXOp(%1, %5, %arg132, %7) {BlockSize.x = 128 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown8", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> + %7 = "byre.alias"(%alloc) <{offset = 10919936 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + byre.compute @PTXOp(%1, %5, %arg132, %7) {BlockSize.x = 256 : i32, GridSize.x = 25 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown8", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg129, %arg33, %7, %5, %arg192, %arg191) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - %8 = "byre.alias"(%alloc) {device = "cuda", offset = 12525568 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + %8 = "byre.alias"(%alloc) <{offset = 12525568 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%5, %arg128, %8) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %9 = "byre.alias"(%alloc) {device = "cuda", offset = 16490496 : i64} : (memref<25927680xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> + %9 = "byre.alias"(%alloc) <{offset = 16490496 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg127, %5, %9) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg127, %8, %5) {BlockSize.x = 128 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown12", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, 
memref<1x512x7x7xf16, "cuda"> - %10 = "byre.alias"(%alloc) {device = "cuda", offset = 10919936 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + byre.compute @PTXOp(%arg127, %8, %5) {BlockSize.x = 256 : i32, GridSize.x = 25 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown4", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> + %10 = "byre.alias"(%alloc) <{offset = 10970112 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg126, %arg31, %5, %10, %arg190, %arg189) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - %11 = "byre.alias"(%alloc) {device = "cuda", offset = 12525568 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + %11 = "byre.alias"(%alloc) <{offset = 12525568 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%10, %arg125, %11) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %12 = "byre.alias"(%alloc) {device = "cuda", offset = 14131200 : i64} : (memref<25927680xi8, "cuda">) -> memref<512x256x3x3xf16, "cuda"> + %12 = "byre.alias"(%alloc) <{offset = 14131200 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<512x256x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg124, %10, %12) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x256x3x3xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg131, %arg35, %7, %10, %arg197, %arg196) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - %13 = "byre.alias"(%alloc) {device = "cuda", offset = 12625920 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%10, %arg130, %13) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %14 = "byre.alias"(%alloc) {device = "cuda", offset = 819200 : i64} : (memref<25927680xi8, "cuda">) 
-> memref<512x256x1x1xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg124, %10, %14) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda"> - %15 = "byre.alias"(%alloc) {device = "cuda", offset = 10919936 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - byre.compute @PTXOp(%13, %11, %arg124, %15) {BlockSize.x = 128 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown19", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg123, %arg29, %15, %11, %arg186, %arg185) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - %16 = "byre.alias"(%alloc) {device = "cuda", offset = 11020288 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%11, %arg122, %16) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %17 = "byre.alias"(%alloc) {device = "cuda", offset = 9740288 : i64} : (memref<25927680xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg121, %11, %17) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg121, %16, %11) {BlockSize.x = 128 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown23", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg120, %arg27, %11, %16, %arg184, %arg183) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%16, %arg119, %11) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], 
output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %18 = "byre.alias"(%alloc) {device = "cuda", offset = 7380992 : i64} : (memref<25927680xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg118, %16, %18) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> - %19 = "byre.alias"(%alloc) {device = "cuda", offset = 6389760 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - byre.compute @PTXOp(%15, %11, %arg118, %19) {BlockSize.x = 128 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown27", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg115, %arg23, %19, %11, %arg177, %arg176) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%11, %arg114, %15) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %20 = "byre.alias"(%alloc) {device = "cuda", offset = 8560640 : i64} : (memref<25927680xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg113, %11, %20) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg113, %15, %11) {BlockSize.x = 128 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown31", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %21 = "byre.alias"(%alloc) {device = "cuda", offset = 6490112 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg112, %arg21, %11, %21, %arg175, %arg174) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, 
memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - %22 = "byre.alias"(%alloc) {device = "cuda", offset = 10919936 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%21, %arg111, %22) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %23 = "byre.alias"(%alloc) {device = "cuda", offset = 6791168 : i64} : (memref<25927680xi8, "cuda">) -> memref<256x128x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg110, %21, %23) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x128x3x3xf16, "cuda"> - %24 = "byre.alias"(%alloc) {device = "cuda", offset = 11120640 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg117, %arg25, %19, %24, %arg182, %arg181) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - %25 = "byre.alias"(%alloc) {device = "cuda", offset = 12525568 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + %13 = "byre.alias"(%alloc) <{offset = 9740288 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg131, %arg35, %7, %13, %arg197, %arg196) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + %14 = "byre.alias"(%alloc) <{offset = 12625920 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%13, %arg130, %14) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + %15 = "byre.alias"(%alloc) <{offset = 819200 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<512x256x1x1xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg124, %13, %15) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : 
memref<1x256x14x14xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda"> + %16 = "byre.alias"(%alloc) <{offset = 10919936 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + byre.compute @PTXOp(%14, %11, %arg124, %16) {BlockSize.x = 256 : i32, GridSize.x = 49 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown19", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg123, %arg29, %16, %11, %arg186, %arg185) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + %17 = "byre.alias"(%alloc) <{offset = 11020288 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%11, %arg122, %17) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + %18 = "byre.alias"(%alloc) <{offset = 9740288 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg121, %11, %18) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg121, %17, %11) {BlockSize.x = 256 : i32, GridSize.x = 49 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown23", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg120, %arg27, %11, %17, %arg184, %arg183) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + %19 = "byre.alias"(%alloc) <{offset = 8560640 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%17, %arg119, %19) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + %20 = "byre.alias"(%alloc) <{offset 
= 7380992 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg118, %17, %20) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%16, %19, %arg118, %11) {BlockSize.x = 256 : i32, GridSize.x = 49 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown19", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg115, %arg23, %11, %16, %arg177, %arg176) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%16, %arg114, %14) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + %21 = "byre.alias"(%alloc) <{offset = 8560640 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg113, %16, %21) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg113, %14, %16) {BlockSize.x = 256 : i32, GridSize.x = 49 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown23", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg112, %arg21, %16, %14, %arg175, %arg174) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + %22 = "byre.alias"(%alloc) <{offset = 10919936 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%14, %arg111, %22) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides 
= dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + %23 = "byre.alias"(%alloc) <{offset = 6791168 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<256x128x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg110, %14, %23) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x128x3x3xf16, "cuda"> + %24 = "byre.alias"(%alloc) <{offset = 11120640 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg117, %arg25, %11, %24, %arg182, %arg181) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + %25 = "byre.alias"(%alloc) <{offset = 12525568 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%24, %arg116, %25) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x128x1x1xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %26 = "byre.alias"(%alloc) {device = "cuda", offset = 311296 : i64} : (memref<25927680xi8, "cuda">) -> memref<256x128x1x1xf16, "cuda"> + %26 = "byre.alias"(%alloc) <{offset = 311296 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<256x128x1x1xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg110, %24, %26) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x128x1x1xf16, "cuda"> - %27 = "byre.alias"(%alloc) {device = "cuda", offset = 1081344 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> - byre.compute @PTXOp(%25, %22, %arg110, %27) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown38", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + %27 = "byre.alias"(%alloc) <{offset = 1081344 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%25, %22, %arg110, %27) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown38", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, 
memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg109, %arg19, %27, %25, %arg171, %arg170) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%25, %arg108, %22) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %28 = "byre.alias"(%alloc) {device = "cuda", offset = 0 : i64} : (memref<25927680xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + %28 = "byre.alias"(%alloc) <{offset = 0 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg107, %25, %28) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg107, %22, %25) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown42", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%arg107, %22, %25) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown42", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg106, %arg17, %25, %22, %arg169, %arg168) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%22, %arg105, %25) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %29 = "byre.alias"(%alloc) {device = "cuda", offset = 1376256 : i64} : (memref<25927680xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + %29 = "byre.alias"(%alloc) <{offset = 1376256 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg104, %22, %29) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, 
input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> - %30 = "byre.alias"(%alloc) {device = "cuda", offset = 6389760 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> - byre.compute @PTXOp(%27, %25, %arg104, %30) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown46", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + %30 = "byre.alias"(%alloc) <{offset = 6389760 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%27, %25, %arg104, %30) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown38", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg101, %arg13, %30, %25, %arg162, %arg161) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%25, %arg100, %22) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %31 = "byre.alias"(%alloc) {device = "cuda", offset = 1081344 : i64} : (memref<25927680xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + %31 = "byre.alias"(%alloc) <{offset = 1081344 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg99, %25, %31) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg99, %22, %25) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown50", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %32 = "byre.alias"(%alloc) {device = "cuda", offset = 6590464 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%arg99, %22, %25) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown42", memory_effects = [1 
: i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + %32 = "byre.alias"(%alloc) <{offset = 6590464 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg98, %arg11, %25, %32, %arg160, %arg159) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - %33 = "byre.alias"(%alloc) {device = "cuda", offset = 10919936 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> + %33 = "byre.alias"(%alloc) <{offset = 10919936 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%32, %arg97, %33) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %34 = "byre.alias"(%alloc) {device = "cuda", offset = 671744 : i64} : (memref<25927680xi8, "cuda">) -> memref<128x64x3x3xf16, "cuda"> + %34 = "byre.alias"(%alloc) <{offset = 671744 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<128x64x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg96, %32, %34) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128x64x3x3xf16, "cuda"> - %35 = "byre.alias"(%alloc) {device = "cuda", offset = 11321344 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + %35 = "byre.alias"(%alloc) <{offset = 11321344 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg103, %arg15, %30, %35, %arg167, %arg166) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - %36 = "byre.alias"(%alloc) {device = "cuda", offset = 12525568 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> + %36 = "byre.alias"(%alloc) <{offset = 12525568 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%35, %arg102, %36) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x64x1x1xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %37 = "byre.alias"(%alloc) 
{device = "cuda", offset = 294912 : i64} : (memref<25927680xi8, "cuda">) -> memref<128x64x1x1xf16, "cuda"> + %37 = "byre.alias"(%alloc) <{offset = 294912 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<128x64x1x1xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg96, %35, %37) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128x64x1x1xf16, "cuda"> - %38 = "byre.alias"(%alloc) {device = "cuda", offset = 6389760 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> - byre.compute @PTXOp(%36, %33, %arg96, %38) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown57", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + %38 = "byre.alias"(%alloc) <{offset = 6389760 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> + byre.compute @PTXOp(%36, %33, %arg96, %38) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown57", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg95, %arg9, %38, %36, %arg156, %arg155) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%36, %arg94, %33) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %39 = "byre.alias"(%alloc) {device = "cuda", offset = 376832 : i64} : (memref<25927680xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + %39 = "byre.alias"(%alloc) <{offset = 598016 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg93, %36, %39) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg93, %33, %36) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown61", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, 
memref<1x64x56x56xf16, "cuda"> + byre.compute @PTXOp(%arg93, %33, %36) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown61", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg92, %arg7, %36, %33, %arg154, %arg153) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%33, %arg91, %36) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %40 = "byre.alias"(%alloc) {device = "cuda", offset = 524288 : i64} : (memref<25927680xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + %40 = "byre.alias"(%alloc) <{offset = 524288 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg90, %33, %40) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%38, %36, %arg90, %33) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown65", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + byre.compute @PTXOp(%38, %36, %arg90, %33) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown57", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg89, %arg5, %33, %36, %arg150, %arg149) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - %41 = "byre.alias"(%alloc) {device = "cuda", offset = 11321344 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> + %41 = "byre.alias"(%alloc) <{offset = 11321344 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%36, %arg88, %41) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 
2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %42 = "byre.alias"(%alloc) {device = "cuda", offset = 598016 : i64} : (memref<25927680xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + %42 = "byre.alias"(%alloc) <{offset = 376832 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg87, %36, %42) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg87, %41, %36) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown69", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + byre.compute @PTXOp(%arg87, %41, %36) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown61", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg86, %arg3, %36, %41, %arg148, %arg147) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%41, %arg85, %36) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %43 = "byre.alias"(%alloc) {device = "cuda", offset = 450560 : i64} : (memref<25927680xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + %43 = "byre.alias"(%alloc) <{offset = 450560 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg84, %41, %43) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%33, %36, %38) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown73", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %44 = "byre.alias"(%alloc) {device = "cuda", offset = 12525568 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x64x112x112xf16, 
"cuda"> + byre.compute @PTXOp(%33, %36, %38) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown73", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + %44 = "byre.alias"(%alloc) <{offset = 12525568 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x64x112x112xf16, "cuda"> byre.compute @PoolMaxGradOp_f16f16_f16(%arg83, %38, %44) {device = "cuda", memory_effects = [1 : i32, 1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<1x64x112x112xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x112x112xf16, "cuda"> - %45 = "byre.alias"(%alloc) {device = "cuda", offset = 10919936 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x64x112x112xf16, "cuda"> - byre.compute @PTXOp(%arg83, %44, %45) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown74", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x112x112xf16, "cuda">, memref<1x64x112x112xf16, "cuda">, memref<1x64x112x112xf16, "cuda"> + %45 = "byre.alias"(%alloc) <{offset = 10919936 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1x64x112x112xf16, "cuda"> + byre.compute @PTXOp(%arg83, %44, %45) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown74", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x112x112xf16, "cuda">, memref<1x64x112x112xf16, "cuda">, memref<1x64x112x112xf16, "cuda"> byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%arg82, %arg1, %45, %44, %arg143, %arg142) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<1x64x112x112xf16, "cuda">, memref<1x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - %46 = "byre.alias"(%alloc) {device = "cuda", offset = 10919936 : i64} : (memref<25927680xi8, "cuda">) -> memref<64x3x7x7xf16, "cuda"> + %46 = "byre.alias"(%alloc) <{offset = 10919936 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<64x3x7x7xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%arg81, %44, %46) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x3x224x224xf16, "cuda">, memref<1x64x112x112xf16, "cuda">, memref<64x3x7x7xf16, "cuda"> - byre.compute @PTXOp(%46, %arg144) {BlockSize.x = 128 : i32, GridSize.x = 74 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown77", memory_effects = [1 : i32, 2 : i32]} : memref<64x3x7x7xf16, "cuda">, memref<64x3x7x7xf32, "cuda"> - %47 = "byre.alias"(%alloc) {device = "cuda", offset = 12525568 : i64} : (memref<25927680xi8, "cuda">) -> memref<1x1000xf32, "cuda"> - byre.compute @PTXOp(%arg141, %47) {BlockSize.x = 128 : i32, GridSize.x = 8 : i32, arg_ranks = [2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown78", memory_effects = [1 : i32, 2 : i32]} : memref<1x1000xf16, "cuda">, 
memref<1x1000xf32, "cuda"> - %48 = "byre.alias"(%alloc) {device = "cuda", offset = 10919936 : i64} : (memref<25927680xi8, "cuda">) -> memref<1000xf32, "cuda"> - byre.compute @ReduceSumOp_f32_f32(%47, %48) {device = "cuda", dimensions = dense<0> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<1x1000xf32, "cuda">, memref<1000xf32, "cuda"> - byre.compute @PTXOp(%48, %arg145) {BlockSize.x = 128 : i32, GridSize.x = 8 : i32, arg_ranks = [1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown79", memory_effects = [1 : i32, 2 : i32]} : memref<1000xf32, "cuda">, memref<1000xf32, "cuda"> - %49 = "byre.alias"(%arg141) {device = "cuda", offset = 0 : i64} : (memref<1x1000xf16, "cuda">) -> memref<1000x1xf16, "cuda"> - %50 = "byre.alias"(%alloc) {device = "cuda", offset = 12525568 : i64} : (memref<25927680xi8, "cuda">) -> memref<1000x512xf16, "cuda"> - byre.compute @MatmulOp_f16f16_f16(%49, %arg139, %50) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<1000x1xf16, "cuda">, memref<1x512xf16, "cuda">, memref<1000x512xf16, "cuda"> - byre.compute @PTXOp(%50, %arg146) {BlockSize.x = 128 : i32, GridSize.x = 4000 : i32, arg_ranks = [2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown80", memory_effects = [1 : i32, 2 : i32]} : memref<1000x512xf16, "cuda">, memref<1000x512xf32, "cuda"> - byre.compute @PTXOp(%43, %arg151) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown81", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%42, %arg152) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown82", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%40, %arg157) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown83", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%39, %arg158) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown84", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%34, %arg163) {BlockSize.x = 128 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown85", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x3x3xf16, "cuda">, memref<128x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%31, %arg164) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown86", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> - byre.compute @PTXOp(%37, %arg165) {BlockSize.x = 128 : i32, GridSize.x = 64 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown87", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x1x1xf16, "cuda">, memref<128x64x1x1xf32, "cuda"> - byre.compute @PTXOp(%29, %arg172) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown88", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> - byre.compute @PTXOp(%28, %arg173) 
{BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown89", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> - byre.compute @PTXOp(%23, %arg178) {BlockSize.x = 128 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown90", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x3x3xf16, "cuda">, memref<256x128x3x3xf32, "cuda"> - byre.compute @PTXOp(%20, %arg179) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown91", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> - byre.compute @PTXOp(%26, %arg180) {BlockSize.x = 128 : i32, GridSize.x = 256 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown92", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x1x1xf16, "cuda">, memref<256x128x1x1xf32, "cuda"> - byre.compute @PTXOp(%18, %arg187) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown93", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> - byre.compute @PTXOp(%17, %arg188) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown94", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> - byre.compute @PTXOp(%12, %arg193) {BlockSize.x = 128 : i32, GridSize.x = 9216 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown95", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x3x3xf16, "cuda">, memref<512x256x3x3xf32, "cuda"> - byre.compute @PTXOp(%9, %arg194) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown96", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> - byre.compute @PTXOp(%14, %arg195) {BlockSize.x = 128 : i32, GridSize.x = 1024 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown97", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x1x1xf16, "cuda">, memref<512x256x1x1xf32, "cuda"> - byre.compute @PTXOp(%6, %arg202) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown98", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> - byre.compute @PTXOp(%4, %arg203) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown99", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> + byre.compute @PTXOp(%46, %arg144) {BlockSize.x = 256 : i32, GridSize.x = 10 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown77", memory_effects = [1 : i32, 2 : i32]} : memref<64x3x7x7xf16, "cuda">, memref<64x3x7x7xf32, "cuda"> + byre.compute @PTXOp(%arg141, %arg145) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown78", memory_effects = [1 : i32, 2 : i32]} : memref<1x1000xf16, "cuda">, memref<1000xf32, "cuda"> + %47 = "byre.alias"(%arg141) <{offset = 0 : i64}> {device = "cuda"} : (memref<1x1000xf16, "cuda">) -> memref<1000x1xf16, "cuda"> + %48 = 
"byre.alias"(%alloc) <{offset = 12525568 : i64}> {device = "cuda"} : (memref<25927680xi8, "cuda">) -> memref<1000x512xf16, "cuda"> + byre.compute @MatmulOp_f16f16_f16(%47, %arg139, %48) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<1000x1xf16, "cuda">, memref<1x512xf16, "cuda">, memref<1000x512xf16, "cuda"> + byre.compute @PTXOp(%48, %arg146) {BlockSize.x = 256 : i32, GridSize.x = 500 : i32, arg_ranks = [2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown79", memory_effects = [1 : i32, 2 : i32]} : memref<1000x512xf16, "cuda">, memref<1000x512xf32, "cuda"> + byre.compute @PTXOp(%43, %arg151) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown80", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> + byre.compute @PTXOp(%42, %arg152) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown80", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> + byre.compute @PTXOp(%40, %arg157) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown80", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> + byre.compute @PTXOp(%39, %arg158) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown80", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> + byre.compute @PTXOp(%34, %arg163) {BlockSize.x = 256 : i32, GridSize.x = 72 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown84", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x3x3xf16, "cuda">, memref<128x64x3x3xf32, "cuda"> + byre.compute @PTXOp(%31, %arg164) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown85", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> + byre.compute @PTXOp(%37, %arg165) {BlockSize.x = 256 : i32, GridSize.x = 8 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown86", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x1x1xf16, "cuda">, memref<128x64x1x1xf32, "cuda"> + byre.compute @PTXOp(%29, %arg172) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown85", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> + byre.compute @PTXOp(%28, %arg173) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown85", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> + byre.compute @PTXOp(%23, %arg178) {BlockSize.x = 256 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown89", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x3x3xf16, "cuda">, memref<256x128x3x3xf32, "cuda"> + byre.compute @PTXOp(%21, %arg179) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown90", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, 
memref<256x256x3x3xf32, "cuda"> + byre.compute @PTXOp(%26, %arg180) {BlockSize.x = 256 : i32, GridSize.x = 32 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown91", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x1x1xf16, "cuda">, memref<256x128x1x1xf32, "cuda"> + byre.compute @PTXOp(%20, %arg187) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown90", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> + byre.compute @PTXOp(%18, %arg188) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown90", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> + byre.compute @PTXOp(%12, %arg193) {BlockSize.x = 256 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown94", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x3x3xf16, "cuda">, memref<512x256x3x3xf32, "cuda"> + byre.compute @PTXOp(%9, %arg194) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown95", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> + byre.compute @PTXOp(%15, %arg195) {BlockSize.x = 256 : i32, GridSize.x = 128 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown96", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x1x1xf16, "cuda">, memref<512x256x1x1xf32, "cuda"> + byre.compute @PTXOp(%6, %arg202) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown95", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> + byre.compute @PTXOp(%4, %arg203) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown95", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> return } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/FW/10b_ptx_codegen.mlir b/compiler/test/E2E/ResNet18/FW/10b_ptx_codegen.mlir index efbda3322..54a161e09 100644 --- a/compiler/test/E2E/ResNet18/FW/10b_ptx_codegen.mlir +++ b/compiler/test/E2E/ResNet18/FW/10b_ptx_codegen.mlir @@ -4,7 +4,7 @@ module attributes {byre.container_module, gpu.container_module} { gpu.module @unified { - llvm.func @Unknown100(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.func @Unknown92(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr {llvm.noalias}, %arg6: !llvm.ptr {llvm.noalias}, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr {llvm.noalias}, %arg11: !llvm.ptr {llvm.noalias}, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> @@ -12,8 +12,8 @@ module attributes 
{byre.container_module, gpu.container_module} { %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 + %7 = llvm.mlir.constant(1.000000e-01 : f32) : f32 + %8 = llvm.mlir.constant(0.899999976 : f32) : f32 %9 = llvm.mlir.constant(512 : index) : i64 %10 = nvvm.read.ptx.sreg.ctaid.x : i32 %11 = llvm.sext %10 : i32 to i64 @@ -23,408 +23,30 @@ module attributes {byre.container_module, gpu.container_module} { %15 = llvm.sext %14 : i32 to i64 %16 = llvm.mul %13, %11 : i64 %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown99(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(512 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown98(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: 
i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(512 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown97(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(512 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = 
llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown96(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(512 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown95(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(512 : index) : i64 - %10 = 
nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown94(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(512 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown93(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, 
array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(512 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown92(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(512 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown91(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: 
!llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(512 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown90(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(256 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load 
%21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown89(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(256 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown88(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { + %18 = nvvm.read.ptx.sreg.nctaid.x : i32 + %19 = llvm.sext %18 : i32 to i64 + %20 = llvm.mul %13, %19 : i64 + llvm.br ^bb1(%17 : i64) + ^bb1(%21: i64): // 2 preds: ^bb0, ^bb2 + %22 = llvm.icmp "slt" %21, %9 : i64 + llvm.cond_br %22, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %23 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %24 = llvm.getelementptr %arg6[%21] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %25 = llvm.load %24 : !llvm.ptr -> f32 + %26 = llvm.getelementptr %arg1[%21] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %27 = llvm.load %26 : !llvm.ptr -> f32 + %28 = llvm.fmul %25, %8 : f32 + %29 = llvm.fmul %27, %7 : f32 + %30 = llvm.fadd %29, %28 : f32 + %31 = llvm.getelementptr %arg11[%21] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %30, %31 : f32, !llvm.ptr + %32 = llvm.add %21, %20 : i64 + llvm.br ^bb1(%32 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown82(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, 
%arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr {llvm.noalias}, %arg6: !llvm.ptr {llvm.noalias}, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr {llvm.noalias}, %arg11: !llvm.ptr {llvm.noalias}, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> @@ -432,8 +54,8 @@ module attributes {byre.container_module, gpu.container_module} { %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 + %7 = llvm.mlir.constant(1.000000e-01 : f32) : f32 + %8 = llvm.mlir.constant(0.899999976 : f32) : f32 %9 = llvm.mlir.constant(256 : index) : i64 %10 = nvvm.read.ptx.sreg.ctaid.x : i32 %11 = llvm.sext %10 : i32 to i64 @@ -443,23 +65,30 @@ module attributes {byre.container_module, gpu.container_module} { %15 = llvm.sext %14 : i32 to i64 %16 = llvm.mul %13, %11 : i64 %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown87(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { + %18 = nvvm.read.ptx.sreg.nctaid.x : i32 + %19 = llvm.sext %18 : i32 to i64 + %20 = llvm.mul %13, %19 : i64 + llvm.br ^bb1(%17 : i64) + ^bb1(%21: i64): // 2 preds: ^bb0, ^bb2 + %22 = llvm.icmp "slt" %21, %9 : i64 + llvm.cond_br %22, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %23 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %24 = llvm.getelementptr %arg6[%21] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %25 = llvm.load %24 : !llvm.ptr -> f32 + %26 = llvm.getelementptr %arg1[%21] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %27 = llvm.load %26 : !llvm.ptr -> f32 + %28 = llvm.fmul %25, %8 : f32 + %29 = llvm.fmul %27, %7 : f32 + %30 = llvm.fadd %29, %28 : f32 + %31 = llvm.getelementptr %arg11[%21] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %30, %31 : f32, !llvm.ptr + %32 = llvm.add %21, %20 : i64 + llvm.br ^bb1(%32 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown72(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr {llvm.noalias}, %arg6: !llvm.ptr {llvm.noalias}, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr {llvm.noalias}, %arg11: !llvm.ptr {llvm.noalias}, %arg12: i64, %arg13: i64, %arg14: i64) attributes 
{gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> @@ -467,9 +96,9 @@ module attributes {byre.container_module, gpu.container_module} { %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(256 : index) : i64 + %7 = llvm.mlir.constant(1.000000e-01 : f32) : f32 + %8 = llvm.mlir.constant(0.899999976 : f32) : f32 + %9 = llvm.mlir.constant(128 : index) : i64 %10 = nvvm.read.ptx.sreg.ctaid.x : i32 %11 = llvm.sext %10 : i32 to i64 %12 = nvvm.read.ptx.sreg.ntid.x : i32 @@ -478,23 +107,30 @@ module attributes {byre.container_module, gpu.container_module} { %15 = llvm.sext %14 : i32 to i64 %16 = llvm.mul %13, %11 : i64 %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown86(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { + %18 = nvvm.read.ptx.sreg.nctaid.x : i32 + %19 = llvm.sext %18 : i32 to i64 + %20 = llvm.mul %13, %19 : i64 + llvm.br ^bb1(%17 : i64) + ^bb1(%21: i64): // 2 preds: ^bb0, ^bb2 + %22 = llvm.icmp "slt" %21, %9 : i64 + llvm.cond_br %22, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %23 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %24 = llvm.getelementptr %arg6[%21] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %25 = llvm.load %24 : !llvm.ptr -> f32 + %26 = llvm.getelementptr %arg1[%21] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %27 = llvm.load %26 : !llvm.ptr -> f32 + %28 = llvm.fmul %25, %8 : f32 + %29 = llvm.fmul %27, %7 : f32 + %30 = llvm.fadd %29, %28 : f32 + %31 = llvm.getelementptr %arg11[%21] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %30, %31 : f32, !llvm.ptr + %32 = llvm.add %21, %20 : i64 + llvm.br ^bb1(%32 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown62(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr {llvm.noalias}, %arg6: !llvm.ptr {llvm.noalias}, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr {llvm.noalias}, %arg11: !llvm.ptr {llvm.noalias}, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, 
array<1 x i64>, array<1 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> @@ -502,9 +138,9 @@ module attributes {byre.container_module, gpu.container_module} { %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(256 : index) : i64 + %7 = llvm.mlir.constant(1.000000e-01 : f32) : f32 + %8 = llvm.mlir.constant(0.899999976 : f32) : f32 + %9 = llvm.mlir.constant(64 : index) : i64 %10 = nvvm.read.ptx.sreg.ctaid.x : i32 %11 = llvm.sext %10 : i32 to i64 %12 = nvvm.read.ptx.sreg.ntid.x : i32 @@ -513,2246 +149,182 @@ module attributes {byre.container_module, gpu.container_module} { %15 = llvm.sext %14 : i32 to i64 %16 = llvm.mul %13, %11 : i64 %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown85(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { + %18 = nvvm.read.ptx.sreg.nctaid.x : i32 + %19 = llvm.sext %18 : i32 to i64 + %20 = llvm.mul %13, %19 : i64 + llvm.br ^bb1(%17 : i64) + ^bb1(%21: i64): // 2 preds: ^bb0, ^bb2 + %22 = llvm.icmp "slt" %21, %9 : i64 + llvm.cond_br %22, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %23 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %24 = llvm.getelementptr %arg6[%21] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %25 = llvm.load %24 : !llvm.ptr -> f32 + %26 = llvm.getelementptr %arg1[%21] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %27 = llvm.load %26 : !llvm.ptr -> f32 + %28 = llvm.fmul %25, %8 : f32 + %29 = llvm.fmul %27, %7 : f32 + %30 = llvm.fadd %29, %28 : f32 + %31 = llvm.getelementptr %arg11[%21] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %30, %31 : f32, !llvm.ptr + %32 = llvm.add %21, %20 : i64 + llvm.br ^bb1(%32 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown61(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr {llvm.noalias}, %arg6: !llvm.ptr {llvm.noalias}, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: i64, %arg12: !llvm.ptr {llvm.noalias}, %arg13: !llvm.ptr {llvm.noalias}, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x 
i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(256 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 + %3 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %4 = llvm.insertvalue %arg5, %3[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %5 = llvm.insertvalue %arg6, %4[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %6 = llvm.insertvalue %arg7, %5[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %7 = llvm.insertvalue %arg8, %6[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %8 = llvm.insertvalue %arg12, %3[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %9 = llvm.insertvalue %arg13, %8[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %10 = llvm.insertvalue %arg14, %9[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %11 = llvm.insertvalue %arg15, %10[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %12 = llvm.mlir.constant(0 : index) : i64 + %13 = llvm.mlir.constant(1000 : index) : i64 + %14 = nvvm.read.ptx.sreg.ctaid.x : i32 %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown84(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 
: f32) : f32 - %9 = llvm.mlir.constant(256 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown83(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(256 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown82(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = 
llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(256 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown81(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(256 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown80(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: 
!llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(128 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown79(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(128 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = 
llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown78(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(128 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown77(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(128 : index) : i64 - %10 = 
nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown76(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(128 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown75(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, 
array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(128 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown74(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(128 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown73(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: 
!llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(128 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown72(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(128 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load 
%21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown71(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(128 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown70(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(64 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 
- %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown69(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(64 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown68(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x 
i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(64 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown67(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(64 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown66(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = 
llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(64 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown65(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(64 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : 
f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown64(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(64 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown63(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(64 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = 
llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown62(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(64 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown61(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.insertvalue %arg5, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %4 = llvm.insertvalue %arg6, %3[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %5 = llvm.insertvalue %arg10, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %6 = 
llvm.insertvalue %arg11, %5[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %7 = llvm.mlir.constant(0.899999976 : f32) : f32 - %8 = llvm.mlir.constant(1.000000e-01 : f32) : f32 - %9 = llvm.mlir.constant(64 : index) : i64 - %10 = nvvm.read.ptx.sreg.ctaid.x : i32 - %11 = llvm.sext %10 : i32 to i64 - %12 = nvvm.read.ptx.sreg.ntid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.tid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = llvm.mul %13, %11 : i64 - %17 = llvm.add %15, %16 : i64 - %18 = llvm.icmp "slt" %17, %9 : i64 - llvm.cond_br %18, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %19 = llvm.getelementptr %arg1[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %20 = llvm.load %19 : !llvm.ptr -> f32 - %21 = llvm.getelementptr %arg6[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %22 = llvm.load %21 : !llvm.ptr -> f32 - %23 = llvm.fmul %22, %7 : f32 - %24 = llvm.fmul %20, %8 : f32 - %25 = llvm.fadd %24, %23 : f32 - %26 = llvm.getelementptr %arg11[%17] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %25, %26 : f32, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown60(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: i64, %arg12: !llvm.ptr, %arg13: !llvm.ptr, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - %3 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %4 = llvm.insertvalue %arg5, %3[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %5 = llvm.insertvalue %arg6, %4[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %6 = llvm.insertvalue %arg7, %5[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %7 = llvm.insertvalue %arg8, %6[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %8 = llvm.insertvalue %arg12, %3[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %9 = llvm.insertvalue %arg13, %8[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %10 = llvm.insertvalue %arg14, %9[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %11 = llvm.insertvalue %arg15, %10[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %12 = llvm.mlir.constant(0 : index) : i64 - %13 = llvm.mlir.constant(1000 : index) : i64 - %14 = nvvm.read.ptx.sreg.ctaid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = nvvm.read.ptx.sreg.ntid.x : i32 - %17 = llvm.sext %16 : i32 to i64 - %18 = nvvm.read.ptx.sreg.tid.x : i32 - %19 = llvm.sext %18 : i32 to i64 - %20 = llvm.mul %17, %15 : i64 - %21 = llvm.add %19, %20 : i64 - %22 = llvm.icmp "slt" %21, %13 : i64 - llvm.cond_br %22, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %23 = llvm.mul %12, %13 : i64 - %24 = llvm.add %23, %21 : i64 - %25 = llvm.getelementptr %arg6[%24] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %26 = llvm.load %25 : !llvm.ptr -> f16 - %27 = llvm.getelementptr %arg1[%21] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %28 = llvm.load %27 : !llvm.ptr -> f32 - %29 = llvm.fptrunc %28 : f32 to f16 - %30 = llvm.fadd %26, %29 : f16 - %31 = llvm.getelementptr %arg13[%24] : (!llvm.ptr, 
i64) -> !llvm.ptr, f16 - llvm.store %30, %31 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown59(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: !llvm.ptr, %arg8: !llvm.ptr, %arg9: i64, %arg10: i64, %arg11: i64, %arg12: i64, %arg13: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %5 = llvm.insertvalue %arg7, %0[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %6 = llvm.insertvalue %arg8, %5[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %7 = llvm.insertvalue %arg9, %6[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %8 = llvm.insertvalue %arg10, %7[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %9 = llvm.mlir.constant(0 : index) : i64 - %10 = llvm.mlir.constant(512000 : index) : i64 - %11 = llvm.mlir.constant(512 : index) : i64 - %12 = llvm.mlir.constant(-1 : index) : i64 - %13 = nvvm.read.ptx.sreg.ctaid.x : i32 - %14 = llvm.sext %13 : i32 to i64 - %15 = nvvm.read.ptx.sreg.ntid.x : i32 - %16 = llvm.sext %15 : i32 to i64 - %17 = nvvm.read.ptx.sreg.tid.x : i32 - %18 = llvm.sext %17 : i32 to i64 - %19 = llvm.mul %16, %14 : i64 - %20 = llvm.add %18, %19 : i64 - %21 = llvm.icmp "slt" %20, %10 : i64 - llvm.cond_br %21, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %22 = llvm.srem %20, %11 : i64 - %23 = llvm.icmp "slt" %22, %9 : i64 - %24 = llvm.add %22, %11 : i64 - %25 = llvm.select %23, %24, %22 : i1, i64 - %26 = llvm.icmp "slt" %20, %9 : i64 - %27 = llvm.sub %12, %20 : i64 - %28 = llvm.select %26, %27, %20 : i1, i64 - %29 = llvm.sdiv %28, %11 : i64 - %30 = llvm.sub %12, %29 : i64 - %31 = llvm.select %26, %30, %29 : i1, i64 - %32 = llvm.mul %31, %11 : i64 - %33 = llvm.add %32, %25 : i64 - %34 = llvm.getelementptr %arg1[%33] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %35 = llvm.load %34 : !llvm.ptr -> f32 - %36 = llvm.fptrunc %35 : f32 to f16 - %37 = llvm.getelementptr %arg8[%33] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %36, %37 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown58(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: !llvm.ptr, %arg8: !llvm.ptr, %arg9: i64, %arg10: i64, %arg11: i64, %arg12: i64, %arg13: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %5 = llvm.insertvalue %arg7, %0[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %6 = llvm.insertvalue %arg8, %5[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, 
array<2 x i64>)> - %7 = llvm.insertvalue %arg9, %6[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %8 = llvm.insertvalue %arg10, %7[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %9 = llvm.mlir.constant(2.040100e-02 : f16) : f16 - %10 = llvm.mlir.constant(0 : index) : i64 - %11 = llvm.mlir.constant(512 : index) : i64 - %12 = nvvm.read.ptx.sreg.ctaid.x : i32 - %13 = llvm.sext %12 : i32 to i64 - %14 = nvvm.read.ptx.sreg.ntid.x : i32 - %15 = llvm.sext %14 : i32 to i64 - %16 = nvvm.read.ptx.sreg.tid.x : i32 - %17 = llvm.sext %16 : i32 to i64 - %18 = llvm.mul %15, %13 : i64 - %19 = llvm.add %17, %18 : i64 - %20 = llvm.icmp "slt" %19, %11 : i64 - llvm.cond_br %20, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %21 = llvm.mul %10, %11 : i64 - %22 = llvm.add %21, %19 : i64 - %23 = llvm.getelementptr %arg1[%22] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %24 = llvm.load %23 : !llvm.ptr -> f16 - %25 = llvm.fmul %24, %9 : f16 - %26 = llvm.getelementptr %arg8[%22] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %25, %26 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown57(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.insertvalue %arg22, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %18 = llvm.insertvalue %arg23, %17[1] : 
!llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %19 = llvm.insertvalue %arg24, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %20 = llvm.insertvalue %arg25, %19[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %21 = llvm.insertvalue %arg29, %20[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %22 = llvm.insertvalue %arg26, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %23 = llvm.insertvalue %arg30, %22[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %24 = llvm.insertvalue %arg27, %23[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %25 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %26 = llvm.mlir.constant(0 : index) : i64 - %27 = llvm.mlir.constant(25088 : index) : i64 - %28 = llvm.mlir.constant(7 : index) : i64 - %29 = llvm.mlir.constant(-1 : index) : i64 - %30 = nvvm.read.ptx.sreg.ctaid.x : i32 - %31 = llvm.sext %30 : i32 to i64 - %32 = nvvm.read.ptx.sreg.ntid.x : i32 - %33 = llvm.sext %32 : i32 to i64 - %34 = nvvm.read.ptx.sreg.tid.x : i32 - %35 = llvm.sext %34 : i32 to i64 - %36 = llvm.mul %33, %31 : i64 - %37 = llvm.add %35, %36 : i64 - %38 = llvm.icmp "slt" %37, %27 : i64 - llvm.cond_br %38, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %39 = llvm.srem %37, %28 : i64 - %40 = llvm.icmp "slt" %39, %26 : i64 - %41 = llvm.add %39, %28 : i64 - %42 = llvm.select %40, %41, %39 : i1, i64 - %43 = llvm.icmp "slt" %37, %26 : i64 - %44 = llvm.sub %29, %37 : i64 - %45 = llvm.select %43, %44, %37 : i1, i64 - %46 = llvm.sdiv %45, %28 : i64 - %47 = llvm.sub %29, %46 : i64 - %48 = llvm.select %43, %47, %46 : i1, i64 - %49 = llvm.srem %48, %28 : i64 - %50 = llvm.icmp "slt" %49, %26 : i64 - %51 = llvm.add %49, %28 : i64 - %52 = llvm.select %50, %51, %49 : i1, i64 - %53 = llvm.icmp "slt" %48, %26 : i64 - %54 = llvm.sub %29, %48 : i64 - %55 = llvm.select %53, %54, %48 : i1, i64 - %56 = llvm.sdiv %55, %28 : i64 - %57 = llvm.sub %29, %56 : i64 - %58 = llvm.select %53, %57, %56 : i1, i64 - %59 = llvm.mul %26, %27 : i64 - %60 = llvm.mlir.constant(49 : index) : i64 - %61 = llvm.mul %58, %60 : i64 - %62 = llvm.add %59, %61 : i64 - %63 = llvm.mul %52, %28 : i64 - %64 = llvm.add %62, %63 : i64 - %65 = llvm.add %64, %42 : i64 - %66 = llvm.getelementptr %arg1[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %67 = llvm.load %66 : !llvm.ptr -> f16 - %68 = llvm.getelementptr %arg12[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %69 = llvm.load %68 : !llvm.ptr -> f16 - %70 = llvm.fadd %67, %69 : f16 - %71 = llvm.intr.maxnum(%70, %25) : (f16, f16) -> f16 - %72 = llvm.getelementptr %arg23[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown55(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 
= llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(2359296 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(512 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(4608 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %70 = llvm.load %69 : !llvm.ptr -> f32 - %71 = llvm.fptrunc %70 : f32 to f16 - %72 = llvm.getelementptr 
%arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown54(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %18 = llvm.mlir.constant(0 : index) : i64 - %19 = llvm.mlir.constant(25088 : index) : i64 - %20 = llvm.mlir.constant(7 : index) : i64 - %21 = llvm.mlir.constant(-1 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %19 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %20 : i64 - %32 = llvm.icmp "slt" %31, %18 : i64 - %33 = llvm.add %31, %20 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %18 : i64 - %36 = llvm.sub %21, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %20 : i64 - %39 = llvm.sub %21, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %20 : i64 - %42 = llvm.icmp "slt" %41, %18 : i64 - %43 = llvm.add %41, %20 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %18 : i64 - %46 = llvm.sub %21, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %20 : i64 - 
%49 = llvm.sub %21, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.mul %18, %19 : i64 - %52 = llvm.mlir.constant(49 : index) : i64 - %53 = llvm.mul %50, %52 : i64 - %54 = llvm.add %51, %53 : i64 - %55 = llvm.mul %44, %20 : i64 - %56 = llvm.add %54, %55 : i64 - %57 = llvm.add %56, %34 : i64 - %58 = llvm.getelementptr %arg1[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %59 = llvm.load %58 : !llvm.ptr -> f16 - %60 = llvm.intr.maxnum(%59, %17) : (f16, f16) -> f16 - %61 = llvm.getelementptr %arg12[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %60, %61 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown52(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(2359296 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(512 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp 
"slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(4608 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %70 = llvm.load %69 : !llvm.ptr -> f32 - %71 = llvm.fptrunc %70 : f32 to f16 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown51(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, 
%13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.insertvalue %arg22, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %18 = llvm.insertvalue %arg23, %17[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %19 = llvm.insertvalue %arg24, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %20 = llvm.insertvalue %arg25, %19[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %21 = llvm.insertvalue %arg29, %20[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %22 = llvm.insertvalue %arg26, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %23 = llvm.insertvalue %arg30, %22[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %24 = llvm.insertvalue %arg27, %23[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %25 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %26 = llvm.mlir.constant(0 : index) : i64 - %27 = llvm.mlir.constant(25088 : index) : i64 - %28 = llvm.mlir.constant(7 : index) : i64 - %29 = llvm.mlir.constant(-1 : index) : i64 - %30 = nvvm.read.ptx.sreg.ctaid.x : i32 - %31 = llvm.sext %30 : i32 to i64 - %32 = nvvm.read.ptx.sreg.ntid.x : i32 - %33 = llvm.sext %32 : i32 to i64 - %34 = nvvm.read.ptx.sreg.tid.x : i32 - %35 = llvm.sext %34 : i32 to i64 - %36 = llvm.mul %33, %31 : i64 - %37 = llvm.add %35, %36 : i64 - %38 = llvm.icmp "slt" %37, %27 : i64 - llvm.cond_br %38, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %39 = llvm.srem %37, %28 : i64 - %40 = llvm.icmp "slt" %39, %26 : i64 - %41 = llvm.add %39, %28 : i64 - %42 = llvm.select %40, %41, %39 : i1, i64 - %43 = llvm.icmp "slt" %37, %26 : i64 - %44 = llvm.sub %29, %37 : i64 - %45 = llvm.select %43, %44, %37 : i1, i64 - %46 = llvm.sdiv %45, %28 : i64 - %47 = llvm.sub %29, %46 : i64 - %48 = llvm.select %43, %47, %46 : i1, i64 - %49 = llvm.srem %48, %28 : i64 - %50 = llvm.icmp "slt" %49, %26 : i64 - %51 = llvm.add %49, %28 : i64 - %52 = llvm.select %50, %51, %49 : i1, i64 - %53 = llvm.icmp "slt" %48, %26 : i64 - %54 = llvm.sub %29, %48 : i64 - %55 = llvm.select %53, %54, %48 : i1, i64 - %56 = llvm.sdiv %55, %28 : i64 - %57 = llvm.sub %29, %56 : i64 - %58 = llvm.select %53, %57, %56 : i1, i64 - %59 = llvm.mul %26, %27 : i64 - %60 = llvm.mlir.constant(49 : index) : i64 - %61 = llvm.mul %58, %60 : i64 - %62 = llvm.add %59, %61 : i64 - %63 = llvm.mul %52, %28 : i64 - %64 = llvm.add %62, %63 : i64 - %65 = llvm.add %64, %42 : i64 - %66 = llvm.getelementptr %arg1[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %67 = llvm.load %66 : !llvm.ptr -> f16 - %68 = llvm.getelementptr %arg12[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %69 = llvm.load %68 : !llvm.ptr -> f16 - %70 = llvm.fadd %67, %69 : f16 - %71 = llvm.intr.maxnum(%70, %25) : (f16, f16) -> f16 - %72 = llvm.getelementptr %arg23[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown49(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) 
attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(2359296 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(512 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(4608 : 
index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %70 = llvm.load %69 : !llvm.ptr -> f32 - %71 = llvm.fptrunc %70 : f32 to f16 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown48(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %18 = llvm.mlir.constant(0 : index) : i64 - %19 = llvm.mlir.constant(25088 : index) : i64 - %20 = llvm.mlir.constant(7 : index) : i64 - %21 = llvm.mlir.constant(-1 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %19 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %20 : i64 - %32 = llvm.icmp "slt" %31, %18 : i64 - %33 = llvm.add %31, %20 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %18 : i64 - %36 = llvm.sub %21, %29 : i64 - %37 = llvm.select 
%35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %20 : i64 - %39 = llvm.sub %21, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %20 : i64 - %42 = llvm.icmp "slt" %41, %18 : i64 - %43 = llvm.add %41, %20 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %18 : i64 - %46 = llvm.sub %21, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %20 : i64 - %49 = llvm.sub %21, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.mul %18, %19 : i64 - %52 = llvm.mlir.constant(49 : index) : i64 - %53 = llvm.mul %50, %52 : i64 - %54 = llvm.add %51, %53 : i64 - %55 = llvm.mul %44, %20 : i64 - %56 = llvm.add %54, %55 : i64 - %57 = llvm.add %56, %34 : i64 - %58 = llvm.getelementptr %arg1[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %59 = llvm.load %58 : !llvm.ptr -> f16 - %60 = llvm.intr.maxnum(%59, %17) : (f16, f16) -> f16 - %61 = llvm.getelementptr %arg12[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %60, %61 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown46(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(1179648 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(256 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - 
%25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(2304 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %70 = llvm.load %69 : !llvm.ptr -> f32 - %71 = llvm.fptrunc %70 : f32 to f16 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown44(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x 
i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(131072 : index) : i64 - %19 = llvm.mlir.constant(256 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = nvvm.read.ptx.sreg.ctaid.x : i32 - %22 = llvm.sext %21 : i32 to i64 - %23 = nvvm.read.ptx.sreg.ntid.x : i32 - %24 = llvm.sext %23 : i32 to i64 - %25 = nvvm.read.ptx.sreg.tid.x : i32 - %26 = llvm.sext %25 : i32 to i64 - %27 = llvm.mul %24, %22 : i64 - %28 = llvm.add %26, %27 : i64 - %29 = llvm.icmp "slt" %28, %18 : i64 - llvm.cond_br %29, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %30 = llvm.srem %28, %19 : i64 - %31 = llvm.icmp "slt" %30, %17 : i64 - %32 = llvm.add %30, %19 : i64 - %33 = llvm.select %31, %32, %30 : i1, i64 - %34 = llvm.icmp "slt" %28, %17 : i64 - %35 = llvm.sub %20, %28 : i64 - %36 = llvm.select %34, %35, %28 : i1, i64 - %37 = llvm.sdiv %36, %19 : i64 - %38 = llvm.sub %20, %37 : i64 - %39 = llvm.select %34, %38, %37 : i1, i64 - %40 = llvm.mul %39, %19 : i64 - %41 = llvm.add %40, %33 : i64 - %42 = llvm.add %41, %17 : i64 - %43 = llvm.add %42, %17 : i64 - %44 = llvm.getelementptr %arg1[%43] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %45 = llvm.load %44 : !llvm.ptr -> f32 - %46 = llvm.fptrunc %45 : f32 to f16 - %47 = llvm.getelementptr %arg12[%43] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %46, %47 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown43(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, 
array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.insertvalue %arg22, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %18 = llvm.insertvalue %arg23, %17[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %19 = llvm.insertvalue %arg24, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %20 = llvm.insertvalue %arg25, %19[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %21 = llvm.insertvalue %arg29, %20[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %22 = llvm.insertvalue %arg26, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %23 = llvm.insertvalue %arg30, %22[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %24 = llvm.insertvalue %arg27, %23[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %25 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %26 = llvm.mlir.constant(0 : index) : i64 - %27 = llvm.mlir.constant(50176 : index) : i64 - %28 = llvm.mlir.constant(14 : index) : i64 - %29 = llvm.mlir.constant(-1 : index) : i64 - %30 = nvvm.read.ptx.sreg.ctaid.x : i32 - %31 = llvm.sext %30 : i32 to i64 - %32 = nvvm.read.ptx.sreg.ntid.x : i32 - %33 = llvm.sext %32 : i32 to i64 - %34 = nvvm.read.ptx.sreg.tid.x : i32 - %35 = llvm.sext %34 : i32 to i64 - %36 = llvm.mul %33, %31 : i64 - %37 = llvm.add %35, %36 : i64 - %38 = llvm.icmp "slt" %37, %27 : i64 - llvm.cond_br %38, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %39 = llvm.srem %37, %28 : i64 - %40 = llvm.icmp "slt" %39, %26 : i64 - %41 = llvm.add %39, %28 : i64 - %42 = llvm.select %40, %41, %39 : i1, i64 - %43 = llvm.icmp "slt" %37, %26 : i64 - %44 = llvm.sub %29, %37 : i64 - %45 = llvm.select %43, %44, %37 : i1, i64 - %46 = llvm.sdiv %45, %28 : i64 - %47 = llvm.sub %29, %46 : i64 - %48 = llvm.select %43, %47, %46 : i1, i64 - %49 = llvm.srem %48, %28 : i64 - %50 = llvm.icmp "slt" %49, %26 : i64 - %51 = llvm.add %49, %28 : i64 - %52 = llvm.select %50, %51, %49 : i1, i64 - %53 = llvm.icmp "slt" %48, %26 : i64 - %54 = llvm.sub %29, %48 : i64 - %55 = llvm.select %53, %54, %48 : i1, i64 - %56 = llvm.sdiv %55, %28 : i64 - %57 = llvm.sub %29, %56 : i64 - %58 = llvm.select %53, %57, %56 : i1, i64 - %59 = llvm.mul %26, %27 : i64 - %60 = llvm.mlir.constant(196 : index) : i64 - %61 = llvm.mul %58, %60 : i64 - %62 = llvm.add %59, %61 : i64 - %63 = llvm.mul %52, %28 : i64 - %64 = llvm.add %62, %63 : i64 - %65 = llvm.add %64, %42 : i64 - %66 = llvm.getelementptr %arg1[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %67 = llvm.load %66 : !llvm.ptr -> f16 - %68 = llvm.getelementptr %arg12[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %69 = llvm.load %68 : !llvm.ptr -> f16 - %70 = llvm.fadd %67, %69 : f16 - %71 = llvm.intr.maxnum(%70, %25) : (f16, f16) -> f16 - %72 = llvm.getelementptr %arg23[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - 
llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown41(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(589824 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(256 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, 
%21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(2304 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %70 = llvm.load %69 : !llvm.ptr -> f32 - %71 = llvm.fptrunc %70 : f32 to f16 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown40(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %18 = llvm.mlir.constant(0 : index) : i64 - %19 = llvm.mlir.constant(50176 : index) : i64 - %20 = llvm.mlir.constant(14 : index) : i64 - %21 = llvm.mlir.constant(-1 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = 
llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %19 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %20 : i64 - %32 = llvm.icmp "slt" %31, %18 : i64 - %33 = llvm.add %31, %20 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %18 : i64 - %36 = llvm.sub %21, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %20 : i64 - %39 = llvm.sub %21, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %20 : i64 - %42 = llvm.icmp "slt" %41, %18 : i64 - %43 = llvm.add %41, %20 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %18 : i64 - %46 = llvm.sub %21, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %20 : i64 - %49 = llvm.sub %21, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.mul %18, %19 : i64 - %52 = llvm.mlir.constant(196 : index) : i64 - %53 = llvm.mul %50, %52 : i64 - %54 = llvm.add %51, %53 : i64 - %55 = llvm.mul %44, %20 : i64 - %56 = llvm.add %54, %55 : i64 - %57 = llvm.add %56, %34 : i64 - %58 = llvm.getelementptr %arg1[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %59 = llvm.load %58 : !llvm.ptr -> f16 - %60 = llvm.intr.maxnum(%59, %17) : (f16, f16) -> f16 - %61 = llvm.getelementptr %arg12[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %60, %61 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown38(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, 
ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(589824 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(256 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(2304 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %70 = llvm.load %69 : !llvm.ptr -> f32 - %71 = llvm.fptrunc %70 : f32 to f16 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown37(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : 
!llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.insertvalue %arg22, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %18 = llvm.insertvalue %arg23, %17[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %19 = llvm.insertvalue %arg24, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %20 = llvm.insertvalue %arg25, %19[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %21 = llvm.insertvalue %arg29, %20[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %22 = llvm.insertvalue %arg26, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %23 = llvm.insertvalue %arg30, %22[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %24 = llvm.insertvalue %arg27, %23[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %25 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %26 = llvm.mlir.constant(0 : index) : i64 - %27 = llvm.mlir.constant(50176 : index) : i64 - %28 = llvm.mlir.constant(14 : index) : i64 - %29 = llvm.mlir.constant(-1 : index) : i64 - %30 = nvvm.read.ptx.sreg.ctaid.x : i32 - %31 = llvm.sext %30 : i32 to i64 - %32 = nvvm.read.ptx.sreg.ntid.x : i32 - %33 = llvm.sext %32 : i32 to i64 - %34 = nvvm.read.ptx.sreg.tid.x : i32 - %35 = llvm.sext %34 : i32 to i64 - %36 = llvm.mul %33, %31 : i64 - %37 = llvm.add %35, %36 : i64 - %38 = llvm.icmp "slt" %37, %27 : i64 - llvm.cond_br %38, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %39 = llvm.srem %37, %28 : i64 - %40 = llvm.icmp "slt" %39, %26 : i64 - %41 = llvm.add %39, %28 : i64 - %42 = llvm.select %40, %41, %39 : i1, i64 - %43 = llvm.icmp "slt" %37, %26 : i64 - %44 = llvm.sub %29, %37 : i64 - %45 = llvm.select %43, %44, %37 : i1, i64 - %46 = llvm.sdiv %45, %28 : i64 - %47 = llvm.sub %29, %46 : i64 - %48 = llvm.select %43, %47, %46 : i1, i64 - %49 = llvm.srem %48, %28 : i64 - %50 = llvm.icmp "slt" %49, %26 : i64 - %51 = llvm.add %49, %28 : i64 - %52 = llvm.select %50, %51, %49 : i1, i64 - %53 = llvm.icmp "slt" %48, %26 : i64 - %54 = llvm.sub %29, %48 : i64 - %55 = llvm.select %53, %54, %48 : i1, i64 - %56 = llvm.sdiv %55, %28 : i64 - %57 = llvm.sub %29, %56 : i64 - %58 = llvm.select %53, %57, %56 : i1, i64 - %59 = llvm.mul %26, %27 : i64 - %60 = llvm.mlir.constant(196 : index) : i64 - %61 = llvm.mul %58, %60 : i64 - %62 = llvm.add %59, %61 : i64 - %63 = llvm.mul %52, %28 : i64 - %64 = llvm.add %62, %63 : i64 - %65 = llvm.add %64, %42 : 
i64 - %66 = llvm.getelementptr %arg1[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %67 = llvm.load %66 : !llvm.ptr -> f16 - %68 = llvm.getelementptr %arg12[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %69 = llvm.load %68 : !llvm.ptr -> f16 - %70 = llvm.fadd %67, %69 : f16 - %71 = llvm.intr.maxnum(%70, %25) : (f16, f16) -> f16 - %72 = llvm.getelementptr %arg23[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown35(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(589824 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(256 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : 
i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(2304 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %70 = llvm.load %69 : !llvm.ptr -> f32 - %71 = llvm.fptrunc %70 : f32 to f16 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown34(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = 
llvm.mlir.constant(0.000000e+00 : f16) : f16 - %18 = llvm.mlir.constant(0 : index) : i64 - %19 = llvm.mlir.constant(50176 : index) : i64 - %20 = llvm.mlir.constant(14 : index) : i64 - %21 = llvm.mlir.constant(-1 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 + %16 = nvvm.read.ptx.sreg.ntid.x : i32 + %17 = llvm.sext %16 : i32 to i64 + %18 = nvvm.read.ptx.sreg.tid.x : i32 + %19 = llvm.sext %18 : i32 to i64 + %20 = llvm.mul %17, %15 : i64 + %21 = llvm.add %19, %20 : i64 + %22 = nvvm.read.ptx.sreg.nctaid.x : i32 %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %19 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %20 : i64 - %32 = llvm.icmp "slt" %31, %18 : i64 - %33 = llvm.add %31, %20 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %18 : i64 - %36 = llvm.sub %21, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %20 : i64 - %39 = llvm.sub %21, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %20 : i64 - %42 = llvm.icmp "slt" %41, %18 : i64 - %43 = llvm.add %41, %20 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %18 : i64 - %46 = llvm.sub %21, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %20 : i64 - %49 = llvm.sub %21, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.mul %18, %19 : i64 - %52 = llvm.mlir.constant(196 : index) : i64 - %53 = llvm.mul %50, %52 : i64 - %54 = llvm.add %51, %53 : i64 - %55 = llvm.mul %44, %20 : i64 - %56 = llvm.add %54, %55 : i64 - %57 = llvm.add %56, %34 : i64 - %58 = llvm.getelementptr %arg1[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %59 = llvm.load %58 : !llvm.ptr -> f16 - %60 = llvm.intr.maxnum(%59, %17) : (f16, f16) -> f16 - %61 = llvm.getelementptr %arg12[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %60, %61 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown32(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { + %24 = llvm.mul %17, %23 : i64 + llvm.br ^bb1(%21 : i64) + ^bb1(%25: i64): // 2 preds: ^bb0, ^bb2 + %26 = llvm.icmp "slt" %25, %13 : i64 + llvm.cond_br %26, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %27 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %28 = llvm.mlir.constant(1 : index) : i64 + %29 = llvm.getelementptr %arg1[%25] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %30 = llvm.load %29 : !llvm.ptr -> f32 + %31 = llvm.insertvalue %25, %5[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %32 = llvm.insertvalue %28, %31[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %33 = llvm.getelementptr %arg6[%25] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %34 = llvm.mul %12, %13 : i64 + %35 = llvm.add %34, %12 : i64 + %36 = llvm.getelementptr %33[%35] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %37 = llvm.load %36 : !llvm.ptr -> f16 + %38 = llvm.fptrunc %30 : f32 to f16 + %39 = llvm.fadd %37, %38 : f16 + %40 = llvm.insertvalue %25, %9[2] : !llvm.struct<(ptr, ptr, 
i64, array<2 x i64>, array<2 x i64>)> + %41 = llvm.insertvalue %28, %40[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %42 = llvm.getelementptr %arg13[%25] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %43 = llvm.getelementptr %42[%35] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %39, %43 : f16, !llvm.ptr + %44 = llvm.add %25, %24 : i64 + llvm.br ^bb1(%44 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown60(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: !llvm.ptr {llvm.noalias}, %arg8: !llvm.ptr {llvm.noalias}, %arg9: i64, %arg10: i64, %arg11: i64, %arg12: i64, %arg13: i64) attributes {gpu.kernel, nvvm.kernel} { + %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %5 = llvm.insertvalue %arg7, %0[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %6 = llvm.insertvalue %arg8, %5[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %7 = llvm.insertvalue %arg9, %6[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %8 = llvm.insertvalue %arg10, %7[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %9 = llvm.mlir.constant(0 : index) : i64 + %10 = llvm.mlir.constant(512000 : index) : i64 + %11 = nvvm.read.ptx.sreg.ctaid.x : i32 + %12 = llvm.sext %11 : i32 to i64 + %13 = nvvm.read.ptx.sreg.ntid.x : i32 + %14 = llvm.sext %13 : i32 to i64 + %15 = nvvm.read.ptx.sreg.tid.x : i32 + %16 = llvm.sext %15 : i32 to i64 + %17 = llvm.mul %14, %12 : i64 + %18 = llvm.add %16, %17 : i64 + %19 = nvvm.read.ptx.sreg.nctaid.x : i32 + %20 = llvm.sext %19 : i32 to i64 + %21 = llvm.mul %14, %20 : i64 + llvm.br ^bb1(%18 : i64) + ^bb1(%22: i64): // 2 preds: ^bb0, ^bb2 + %23 = llvm.icmp "slt" %22, %10 : i64 + llvm.cond_br %23, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %24 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %25 = llvm.insertvalue %22, %2[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %26 = llvm.mlir.constant(1 : index) : i64 + %27 = llvm.insertvalue %26, %25[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %28 = llvm.mlir.constant(512 : index) : i64 + %29 = llvm.getelementptr %arg1[%22] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %30 = llvm.mul %9, %28 : i64 + %31 = llvm.add %30, %9 : i64 + %32 = llvm.getelementptr %29[%31] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %33 = llvm.load %32 : !llvm.ptr -> f32 + %34 = llvm.fptrunc %33 : f32 to f16 + %35 = llvm.insertvalue %22, %6[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %36 = llvm.insertvalue %26, %35[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %37 = llvm.getelementptr %arg8[%22] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %38 = llvm.getelementptr %37[%31] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %34, %38 : f16, !llvm.ptr + %39 = llvm.add %22, %21 : i64 + llvm.br ^bb1(%39 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown59(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, 
%arg5: i64, %arg6: i64, %arg7: !llvm.ptr {llvm.noalias}, %arg8: !llvm.ptr {llvm.noalias}, %arg9: i64, %arg10: i64, %arg11: i64, %arg12: i64, %arg13: i64) attributes {gpu.kernel, nvvm.kernel} { + %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %5 = llvm.insertvalue %arg7, %0[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %6 = llvm.insertvalue %arg8, %5[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %7 = llvm.insertvalue %arg9, %6[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %8 = llvm.insertvalue %arg10, %7[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %9 = llvm.mlir.constant(2.040100e-02 : f16) : f16 + %10 = llvm.mlir.constant(0 : index) : i64 + %11 = llvm.mlir.constant(512 : index) : i64 + %12 = nvvm.read.ptx.sreg.ctaid.x : i32 + %13 = llvm.sext %12 : i32 to i64 + %14 = nvvm.read.ptx.sreg.ntid.x : i32 + %15 = llvm.sext %14 : i32 to i64 + %16 = nvvm.read.ptx.sreg.tid.x : i32 + %17 = llvm.sext %16 : i32 to i64 + %18 = llvm.mul %15, %13 : i64 + %19 = llvm.add %17, %18 : i64 + %20 = nvvm.read.ptx.sreg.nctaid.x : i32 + %21 = llvm.sext %20 : i32 to i64 + %22 = llvm.mul %15, %21 : i64 + llvm.br ^bb1(%19 : i64) + ^bb1(%23: i64): // 2 preds: ^bb0, ^bb2 + %24 = llvm.icmp "slt" %23, %11 : i64 + llvm.cond_br %24, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %25 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %26 = llvm.insertvalue %23, %2[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %27 = llvm.mlir.constant(1 : index) : i64 + %28 = llvm.insertvalue %27, %26[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %29 = llvm.getelementptr %arg1[%23] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %30 = llvm.mul %10, %11 : i64 + %31 = llvm.add %30, %10 : i64 + %32 = llvm.getelementptr %29[%31] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %33 = llvm.load %32 : !llvm.ptr -> f16 + %34 = llvm.fmul %33, %9 : f16 + %35 = llvm.insertvalue %23, %6[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %36 = llvm.insertvalue %27, %35[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %37 = llvm.getelementptr %arg8[%23] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %38 = llvm.getelementptr %37[%31] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %34, %38 : f16, !llvm.ptr + %39 = llvm.add %23, %22 : i64 + llvm.br ^bb1(%39 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown51(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr {llvm.noalias}, %arg23: !llvm.ptr {llvm.noalias}, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 
x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -2770,70 +342,78 @@ module attributes {byre.container_module, gpu.container_module} { %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(294912 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(128 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(1152 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %70 = llvm.load %69 : !llvm.ptr -> f32 - %71 = llvm.fptrunc %70 : f32 to f16 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown30(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { + %17 = llvm.insertvalue %arg22, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %18 = llvm.insertvalue %arg23, %17[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %19 = llvm.insertvalue %arg24, 
%18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %20 = llvm.insertvalue %arg25, %19[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %21 = llvm.insertvalue %arg29, %20[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %22 = llvm.insertvalue %arg26, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %23 = llvm.insertvalue %arg30, %22[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %24 = llvm.insertvalue %arg27, %23[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %25 = llvm.mlir.constant(25088 : index) : i64 + %26 = llvm.mlir.constant(0.000000e+00 : f16) : f16 + %27 = llvm.mlir.constant(0 : index) : i64 + %28 = nvvm.read.ptx.sreg.ctaid.x : i32 + %29 = llvm.sext %28 : i32 to i64 + %30 = nvvm.read.ptx.sreg.ntid.x : i32 + %31 = llvm.sext %30 : i32 to i64 + %32 = nvvm.read.ptx.sreg.tid.x : i32 + %33 = llvm.sext %32 : i32 to i64 + %34 = llvm.mul %31, %29 : i64 + %35 = llvm.add %33, %34 : i64 + %36 = nvvm.read.ptx.sreg.nctaid.x : i32 + %37 = llvm.sext %36 : i32 to i64 + %38 = llvm.mul %31, %37 : i64 + llvm.br ^bb1(%35 : i64) + ^bb1(%39: i64): // 2 preds: ^bb0, ^bb2 + %40 = llvm.icmp "slt" %39, %25 : i64 + llvm.cond_br %40, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %41 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %42 = llvm.insertvalue %39, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %43 = llvm.mlir.constant(1 : index) : i64 + %44 = llvm.insertvalue %43, %42[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %45 = llvm.insertvalue %25, %44[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %46 = llvm.insertvalue %43, %45[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %47 = llvm.mlir.constant(49 : index) : i64 + %48 = llvm.insertvalue %47, %46[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %49 = llvm.insertvalue %43, %48[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %50 = llvm.mlir.constant(7 : index) : i64 + %51 = llvm.getelementptr %arg1[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %52 = llvm.mul %27, %25 : i64 + %53 = llvm.mul %27, %47 : i64 + %54 = llvm.add %52, %53 : i64 + %55 = llvm.mul %27, %50 : i64 + %56 = llvm.add %54, %55 : i64 + %57 = llvm.add %56, %27 : i64 + %58 = llvm.getelementptr %51[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %59 = llvm.load %58 : !llvm.ptr -> f16 + %60 = llvm.insertvalue %39, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %61 = llvm.insertvalue %43, %60[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %62 = llvm.insertvalue %25, %61[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %63 = llvm.insertvalue %43, %62[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %64 = llvm.insertvalue %47, %63[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %65 = llvm.insertvalue %43, %64[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %66 = llvm.getelementptr %arg12[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %67 = llvm.getelementptr %66[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %68 = llvm.load %67 : !llvm.ptr -> f16 + %69 = llvm.fadd %59, %68 : f16 + %70 = llvm.intr.maximum(%69, %26) : (f16, f16) -> f16 + %71 = llvm.insertvalue %39, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %72 = llvm.insertvalue %43, %71[3, 0] : 
!llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %73 = llvm.insertvalue %25, %72[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %74 = llvm.insertvalue %43, %73[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %75 = llvm.insertvalue %47, %74[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %76 = llvm.insertvalue %43, %75[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %77 = llvm.getelementptr %arg23[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %78 = llvm.getelementptr %77[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %70, %78 : f16, !llvm.ptr + %79 = llvm.add %39, %38 : i64 + llvm.br ^bb1(%79 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown49(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -2852,125 +432,59 @@ module attributes {byre.container_module, gpu.container_module} { %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(32768 : index) : i64 - %19 = llvm.mlir.constant(128 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = nvvm.read.ptx.sreg.ctaid.x : i32 + %18 = llvm.mlir.constant(2359296 : index) : i64 + %19 = nvvm.read.ptx.sreg.ctaid.x : i32 + %20 = llvm.sext %19 : i32 to i64 + %21 = nvvm.read.ptx.sreg.ntid.x : i32 %22 = llvm.sext %21 : i32 to i64 - %23 = nvvm.read.ptx.sreg.ntid.x : i32 + %23 = nvvm.read.ptx.sreg.tid.x : i32 %24 = llvm.sext %23 : i32 to i64 - %25 = nvvm.read.ptx.sreg.tid.x : i32 - %26 = llvm.sext %25 : i32 to i64 - %27 = llvm.mul %24, %22 : i64 - %28 = llvm.add %26, %27 : i64 - %29 = llvm.icmp "slt" %28, %18 : i64 - llvm.cond_br %29, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %30 = llvm.srem %28, %19 : i64 - %31 = llvm.icmp "slt" %30, %17 : i64 - %32 = llvm.add %30, %19 : i64 - %33 = llvm.select %31, %32, %30 : i1, i64 - %34 = llvm.icmp "slt" %28, %17 : i64 - %35 = llvm.sub %20, %28 : i64 - %36 = llvm.select %34, %35, %28 : i1, i64 - %37 = llvm.sdiv %36, %19 : i64 - %38 = llvm.sub %20, %37 : i64 - %39 = llvm.select %34, %38, %37 : i1, i64 - %40 = llvm.mul %39, %19 : i64 - %41 = llvm.add %40, %33 : i64 - %42 = llvm.add %41, %17 : i64 - %43 = llvm.add %42, %17 : i64 - %44 = llvm.getelementptr %arg1[%43] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %45 = llvm.load %44 : !llvm.ptr -> f32 - %46 = llvm.fptrunc %45 : f32 to f16 - %47 = llvm.getelementptr %arg12[%43] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %46, %47 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown29(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, 
%arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { - %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %5 = llvm.insertvalue %arg7, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %6 = llvm.insertvalue %arg4, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %7 = llvm.insertvalue %arg8, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %8 = llvm.insertvalue %arg5, %7[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %9 = llvm.insertvalue %arg11, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %10 = llvm.insertvalue %arg12, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %11 = llvm.insertvalue %arg13, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %12 = llvm.insertvalue %arg14, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %13 = llvm.insertvalue %arg18, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.insertvalue %arg22, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %18 = llvm.insertvalue %arg23, %17[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %19 = llvm.insertvalue %arg24, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %20 = llvm.insertvalue %arg25, %19[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %21 = llvm.insertvalue %arg29, %20[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %22 = llvm.insertvalue %arg26, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %23 = llvm.insertvalue %arg30, %22[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %24 = llvm.insertvalue %arg27, %23[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %25 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %26 = llvm.mlir.constant(0 : index) : i64 - %27 = llvm.mlir.constant(100352 : index) : i64 - %28 = llvm.mlir.constant(28 : index) : i64 - %29 = llvm.mlir.constant(-1 : index) : i64 - %30 = nvvm.read.ptx.sreg.ctaid.x : i32 - %31 = llvm.sext %30 : i32 to i64 - %32 = nvvm.read.ptx.sreg.ntid.x : i32 - %33 = llvm.sext %32 : i32 to i64 - %34 = nvvm.read.ptx.sreg.tid.x : i32 - %35 = llvm.sext %34 : i32 to i64 - %36 = llvm.mul %33, %31 : i64 - %37 = llvm.add %35, %36 : i64 - %38 = llvm.icmp "slt" %37, %27 : i64 - llvm.cond_br %38, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %39 = 
llvm.srem %37, %28 : i64 - %40 = llvm.icmp "slt" %39, %26 : i64 - %41 = llvm.add %39, %28 : i64 - %42 = llvm.select %40, %41, %39 : i1, i64 - %43 = llvm.icmp "slt" %37, %26 : i64 - %44 = llvm.sub %29, %37 : i64 - %45 = llvm.select %43, %44, %37 : i1, i64 - %46 = llvm.sdiv %45, %28 : i64 - %47 = llvm.sub %29, %46 : i64 - %48 = llvm.select %43, %47, %46 : i1, i64 - %49 = llvm.srem %48, %28 : i64 - %50 = llvm.icmp "slt" %49, %26 : i64 - %51 = llvm.add %49, %28 : i64 - %52 = llvm.select %50, %51, %49 : i1, i64 - %53 = llvm.icmp "slt" %48, %26 : i64 - %54 = llvm.sub %29, %48 : i64 - %55 = llvm.select %53, %54, %48 : i1, i64 - %56 = llvm.sdiv %55, %28 : i64 - %57 = llvm.sub %29, %56 : i64 - %58 = llvm.select %53, %57, %56 : i1, i64 - %59 = llvm.mul %26, %27 : i64 - %60 = llvm.mlir.constant(784 : index) : i64 - %61 = llvm.mul %58, %60 : i64 - %62 = llvm.add %59, %61 : i64 - %63 = llvm.mul %52, %28 : i64 - %64 = llvm.add %62, %63 : i64 - %65 = llvm.add %64, %42 : i64 - %66 = llvm.getelementptr %arg1[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %67 = llvm.load %66 : !llvm.ptr -> f16 - %68 = llvm.getelementptr %arg12[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %69 = llvm.load %68 : !llvm.ptr -> f16 - %70 = llvm.fadd %67, %69 : f16 - %71 = llvm.intr.maxnum(%70, %25) : (f16, f16) -> f16 - %72 = llvm.getelementptr %arg23[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown27(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { + %25 = llvm.mul %22, %20 : i64 + %26 = llvm.add %24, %25 : i64 + %27 = nvvm.read.ptx.sreg.nctaid.x : i32 + %28 = llvm.sext %27 : i32 to i64 + %29 = llvm.mul %22, %28 : i64 + llvm.br ^bb1(%26 : i64) + ^bb1(%30: i64): // 2 preds: ^bb0, ^bb2 + %31 = llvm.icmp "slt" %30, %18 : i64 + llvm.cond_br %31, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %32 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %33 = llvm.insertvalue %30, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %34 = llvm.mlir.constant(1 : index) : i64 + %35 = llvm.insertvalue %34, %33[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %36 = llvm.mlir.constant(4608 : index) : i64 + %37 = llvm.insertvalue %36, %35[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %34, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.mlir.constant(9 : index) : i64 + %40 = llvm.insertvalue %39, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.insertvalue %34, %40[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %42 = llvm.mlir.constant(3 : index) : i64 + %43 = llvm.getelementptr %arg1[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %44 = llvm.mul %17, %36 : i64 + %45 = llvm.mul %17, %39 : i64 + %46 = llvm.add %44, %45 : i64 + %47 = llvm.mul %17, %42 : i64 + %48 = llvm.add %46, %47 : i64 + %49 = llvm.add %48, %17 : i64 + %50 = llvm.getelementptr %43[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %51 = llvm.load %50 : !llvm.ptr -> f32 + %52 = llvm.fptrunc %51 : f32 to f16 + %53 = llvm.insertvalue %30, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = 
llvm.insertvalue %34, %53[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.insertvalue %36, %54[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %56 = llvm.insertvalue %34, %55[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %39, %56[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.insertvalue %34, %57[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %59 = llvm.getelementptr %arg12[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %60 = llvm.getelementptr %59[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %52, %60 : f16, !llvm.ptr + %61 = llvm.add %30, %29 : i64 + llvm.br ^bb1(%61 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown48(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -2988,70 +502,60 @@ module attributes {byre.container_module, gpu.container_module} { %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(147456 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(128 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 + %17 = llvm.mlir.constant(25088 : index) : i64 + %18 = llvm.mlir.constant(0.000000e+00 : f16) : f16 + %19 = llvm.mlir.constant(0 : index) : i64 + %20 = nvvm.read.ptx.sreg.ctaid.x : i32 + %21 = llvm.sext %20 : i32 to i64 + %22 = nvvm.read.ptx.sreg.ntid.x : i32 %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 + %24 = nvvm.read.ptx.sreg.tid.x : i32 %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 + %26 = llvm.mul %23, %21 : i64 + %27 = llvm.add %25, %26 : i64 + %28 = nvvm.read.ptx.sreg.nctaid.x : i32 + %29 = llvm.sext %28 : i32 to i64 + %30 = llvm.mul %23, %29 : i64 + llvm.br ^bb1(%27 : i64) + ^bb1(%31: i64): // 2 preds: ^bb0, ^bb2 %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, 
%43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(1152 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %70 = llvm.load %69 : !llvm.ptr -> f32 - %71 = llvm.fptrunc %70 : f32 to f16 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown26(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.cond_br %32, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %33 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %34 = llvm.insertvalue %31, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %35 = llvm.mlir.constant(1 : index) : i64 + %36 = llvm.insertvalue %35, %34[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %37 = llvm.insertvalue %17, %36[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %35, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.mlir.constant(49 : index) : i64 + %40 = llvm.insertvalue %39, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.insertvalue %35, %40[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %42 = llvm.mlir.constant(7 : index) : i64 + %43 = llvm.getelementptr %arg1[%31] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %44 = llvm.mul %19, %17 : i64 + %45 = llvm.mul %19, %39 : i64 + %46 = llvm.add %44, %45 : i64 + %47 = llvm.mul %19, %42 : i64 + %48 = llvm.add %46, %47 : i64 + %49 = llvm.add %48, %19 : i64 + %50 = llvm.getelementptr %43[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %51 = llvm.load %50 : !llvm.ptr -> f16 + %52 = llvm.intr.maximum(%51, %18) : (f16, f16) -> f16 + %53 = llvm.insertvalue %31, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %35, %53[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.insertvalue %17, %54[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %56 = llvm.insertvalue %35, %55[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %39, %56[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.insertvalue %35, %57[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %59 = llvm.getelementptr %arg12[%31] : 
(!llvm.ptr, i64) -> !llvm.ptr, f16 + %60 = llvm.getelementptr %59[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %52, %60 : f16, !llvm.ptr + %61 = llvm.add %31, %30 : i64 + llvm.br ^bb1(%61 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown46(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -3069,59 +573,60 @@ module attributes {byre.container_module, gpu.container_module} { %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %18 = llvm.mlir.constant(0 : index) : i64 - %19 = llvm.mlir.constant(100352 : index) : i64 - %20 = llvm.mlir.constant(28 : index) : i64 - %21 = llvm.mlir.constant(-1 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %19 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %20 : i64 - %32 = llvm.icmp "slt" %31, %18 : i64 - %33 = llvm.add %31, %20 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %18 : i64 - %36 = llvm.sub %21, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %20 : i64 - %39 = llvm.sub %21, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %20 : i64 - %42 = llvm.icmp "slt" %41, %18 : i64 - %43 = llvm.add %41, %20 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %18 : i64 - %46 = llvm.sub %21, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %20 : i64 - %49 = llvm.sub %21, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.mul %18, %19 : i64 - %52 = llvm.mlir.constant(784 : index) : i64 - %53 = llvm.mul %50, %52 : i64 - %54 = llvm.add %51, %53 : i64 - %55 = llvm.mul %44, %20 : i64 - %56 = llvm.add %54, %55 : i64 - %57 = llvm.add %56, %34 : i64 - %58 = llvm.getelementptr %arg1[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %59 = llvm.load %58 : !llvm.ptr -> f16 - %60 = llvm.intr.maxnum(%59, %17) : (f16, f16) -> f16 - %61 = llvm.getelementptr %arg12[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %60, %61 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown24(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, 
%arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { + %17 = llvm.mlir.constant(0 : index) : i64 + %18 = llvm.mlir.constant(1179648 : index) : i64 + %19 = nvvm.read.ptx.sreg.ctaid.x : i32 + %20 = llvm.sext %19 : i32 to i64 + %21 = nvvm.read.ptx.sreg.ntid.x : i32 + %22 = llvm.sext %21 : i32 to i64 + %23 = nvvm.read.ptx.sreg.tid.x : i32 + %24 = llvm.sext %23 : i32 to i64 + %25 = llvm.mul %22, %20 : i64 + %26 = llvm.add %24, %25 : i64 + %27 = nvvm.read.ptx.sreg.nctaid.x : i32 + %28 = llvm.sext %27 : i32 to i64 + %29 = llvm.mul %22, %28 : i64 + llvm.br ^bb1(%26 : i64) + ^bb1(%30: i64): // 2 preds: ^bb0, ^bb2 + %31 = llvm.icmp "slt" %30, %18 : i64 + llvm.cond_br %31, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %32 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %33 = llvm.insertvalue %30, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %34 = llvm.mlir.constant(1 : index) : i64 + %35 = llvm.insertvalue %34, %33[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %36 = llvm.mlir.constant(2304 : index) : i64 + %37 = llvm.insertvalue %36, %35[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %34, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.mlir.constant(9 : index) : i64 + %40 = llvm.insertvalue %39, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.insertvalue %34, %40[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %42 = llvm.mlir.constant(3 : index) : i64 + %43 = llvm.getelementptr %arg1[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %44 = llvm.mul %17, %36 : i64 + %45 = llvm.mul %17, %39 : i64 + %46 = llvm.add %44, %45 : i64 + %47 = llvm.mul %17, %42 : i64 + %48 = llvm.add %46, %47 : i64 + %49 = llvm.add %48, %17 : i64 + %50 = llvm.getelementptr %43[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %51 = llvm.load %50 : !llvm.ptr -> f32 + %52 = llvm.fptrunc %51 : f32 to f16 + %53 = llvm.insertvalue %30, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %34, %53[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.insertvalue %36, %54[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %56 = llvm.insertvalue %34, %55[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %39, %56[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.insertvalue %34, %57[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %59 = llvm.getelementptr %arg12[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %60 = llvm.getelementptr %59[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %52, %60 : f16, !llvm.ptr + %61 = llvm.add %30, %29 : i64 + llvm.br ^bb1(%61 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown44(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
%2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -3139,70 +644,56 @@ module attributes {byre.container_module, gpu.container_module} { %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(147456 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(128 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(1152 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %70 = llvm.load %69 : !llvm.ptr -> f32 - %71 = llvm.fptrunc %70 : f32 to f16 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown23(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { + %17 = llvm.mlir.constant(131072 : index) : i64 + %18 = llvm.mlir.constant(0 : index) : i64 + %19 = nvvm.read.ptx.sreg.ctaid.x : i32 + %20 = llvm.sext %19 : i32 to i64 + %21 = 
nvvm.read.ptx.sreg.ntid.x : i32 + %22 = llvm.sext %21 : i32 to i64 + %23 = nvvm.read.ptx.sreg.tid.x : i32 + %24 = llvm.sext %23 : i32 to i64 + %25 = llvm.mul %22, %20 : i64 + %26 = llvm.add %24, %25 : i64 + %27 = nvvm.read.ptx.sreg.nctaid.x : i32 + %28 = llvm.sext %27 : i32 to i64 + %29 = llvm.mul %22, %28 : i64 + llvm.br ^bb1(%26 : i64) + ^bb1(%30: i64): // 2 preds: ^bb0, ^bb2 + %31 = llvm.icmp "slt" %30, %17 : i64 + llvm.cond_br %31, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %32 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %33 = llvm.insertvalue %30, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %34 = llvm.mlir.constant(1 : index) : i64 + %35 = llvm.insertvalue %34, %33[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %36 = llvm.mlir.constant(256 : index) : i64 + %37 = llvm.insertvalue %36, %35[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %34, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.insertvalue %34, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %40 = llvm.insertvalue %34, %39[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.getelementptr %arg1[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %42 = llvm.mul %18, %36 : i64 + %43 = llvm.add %42, %18 : i64 + %44 = llvm.add %43, %18 : i64 + %45 = llvm.add %44, %18 : i64 + %46 = llvm.getelementptr %41[%45] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %47 = llvm.load %46 : !llvm.ptr -> f32 + %48 = llvm.fptrunc %47 : f32 to f16 + %49 = llvm.insertvalue %30, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %50 = llvm.insertvalue %34, %49[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %51 = llvm.insertvalue %36, %50[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %52 = llvm.insertvalue %34, %51[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %53 = llvm.insertvalue %34, %52[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %34, %53[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.getelementptr %arg12[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %56 = llvm.getelementptr %55[%45] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %48, %56 : f16, !llvm.ptr + %57 = llvm.add %30, %29 : i64 + llvm.br ^bb1(%57 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown37(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr {llvm.noalias}, %arg23: !llvm.ptr {llvm.noalias}, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -3228,62 +719,70 @@ module attributes {byre.container_module, gpu.container_module} { %22 = llvm.insertvalue %arg26, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, 
array<4 x i64>, array<4 x i64>)> %23 = llvm.insertvalue %arg30, %22[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %24 = llvm.insertvalue %arg27, %23[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %25 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %26 = llvm.mlir.constant(0 : index) : i64 - %27 = llvm.mlir.constant(100352 : index) : i64 - %28 = llvm.mlir.constant(28 : index) : i64 - %29 = llvm.mlir.constant(-1 : index) : i64 - %30 = nvvm.read.ptx.sreg.ctaid.x : i32 + %25 = llvm.mlir.constant(50176 : index) : i64 + %26 = llvm.mlir.constant(0.000000e+00 : f16) : f16 + %27 = llvm.mlir.constant(0 : index) : i64 + %28 = nvvm.read.ptx.sreg.ctaid.x : i32 + %29 = llvm.sext %28 : i32 to i64 + %30 = nvvm.read.ptx.sreg.ntid.x : i32 %31 = llvm.sext %30 : i32 to i64 - %32 = nvvm.read.ptx.sreg.ntid.x : i32 + %32 = nvvm.read.ptx.sreg.tid.x : i32 %33 = llvm.sext %32 : i32 to i64 - %34 = nvvm.read.ptx.sreg.tid.x : i32 - %35 = llvm.sext %34 : i32 to i64 - %36 = llvm.mul %33, %31 : i64 - %37 = llvm.add %35, %36 : i64 - %38 = llvm.icmp "slt" %37, %27 : i64 - llvm.cond_br %38, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %39 = llvm.srem %37, %28 : i64 - %40 = llvm.icmp "slt" %39, %26 : i64 - %41 = llvm.add %39, %28 : i64 - %42 = llvm.select %40, %41, %39 : i1, i64 - %43 = llvm.icmp "slt" %37, %26 : i64 - %44 = llvm.sub %29, %37 : i64 - %45 = llvm.select %43, %44, %37 : i1, i64 - %46 = llvm.sdiv %45, %28 : i64 - %47 = llvm.sub %29, %46 : i64 - %48 = llvm.select %43, %47, %46 : i1, i64 - %49 = llvm.srem %48, %28 : i64 - %50 = llvm.icmp "slt" %49, %26 : i64 - %51 = llvm.add %49, %28 : i64 - %52 = llvm.select %50, %51, %49 : i1, i64 - %53 = llvm.icmp "slt" %48, %26 : i64 - %54 = llvm.sub %29, %48 : i64 - %55 = llvm.select %53, %54, %48 : i1, i64 - %56 = llvm.sdiv %55, %28 : i64 - %57 = llvm.sub %29, %56 : i64 - %58 = llvm.select %53, %57, %56 : i1, i64 - %59 = llvm.mul %26, %27 : i64 - %60 = llvm.mlir.constant(784 : index) : i64 - %61 = llvm.mul %58, %60 : i64 - %62 = llvm.add %59, %61 : i64 - %63 = llvm.mul %52, %28 : i64 - %64 = llvm.add %62, %63 : i64 - %65 = llvm.add %64, %42 : i64 - %66 = llvm.getelementptr %arg1[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %67 = llvm.load %66 : !llvm.ptr -> f16 - %68 = llvm.getelementptr %arg12[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %69 = llvm.load %68 : !llvm.ptr -> f16 - %70 = llvm.fadd %67, %69 : f16 - %71 = llvm.intr.maxnum(%70, %25) : (f16, f16) -> f16 - %72 = llvm.getelementptr %arg23[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown21(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { + %34 = llvm.mul %31, %29 : i64 + %35 = llvm.add %33, %34 : i64 + %36 = nvvm.read.ptx.sreg.nctaid.x : i32 + %37 = llvm.sext %36 : i32 to i64 + %38 = llvm.mul %31, %37 : i64 + llvm.br ^bb1(%35 : i64) + ^bb1(%39: i64): // 2 preds: ^bb0, ^bb2 + %40 = llvm.icmp "slt" %39, %25 : i64 + llvm.cond_br %40, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %41 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %42 = llvm.insertvalue %39, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %43 = llvm.mlir.constant(1 : index) : i64 + %44 = 
llvm.insertvalue %43, %42[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %45 = llvm.insertvalue %25, %44[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %46 = llvm.insertvalue %43, %45[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %47 = llvm.mlir.constant(196 : index) : i64 + %48 = llvm.insertvalue %47, %46[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %49 = llvm.insertvalue %43, %48[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %50 = llvm.mlir.constant(14 : index) : i64 + %51 = llvm.getelementptr %arg1[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %52 = llvm.mul %27, %25 : i64 + %53 = llvm.mul %27, %47 : i64 + %54 = llvm.add %52, %53 : i64 + %55 = llvm.mul %27, %50 : i64 + %56 = llvm.add %54, %55 : i64 + %57 = llvm.add %56, %27 : i64 + %58 = llvm.getelementptr %51[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %59 = llvm.load %58 : !llvm.ptr -> f16 + %60 = llvm.insertvalue %39, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %61 = llvm.insertvalue %43, %60[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %62 = llvm.insertvalue %25, %61[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %63 = llvm.insertvalue %43, %62[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %64 = llvm.insertvalue %47, %63[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %65 = llvm.insertvalue %43, %64[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %66 = llvm.getelementptr %arg12[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %67 = llvm.getelementptr %66[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %68 = llvm.load %67 : !llvm.ptr -> f16 + %69 = llvm.fadd %59, %68 : f16 + %70 = llvm.intr.maximum(%69, %26) : (f16, f16) -> f16 + %71 = llvm.insertvalue %39, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %72 = llvm.insertvalue %43, %71[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %73 = llvm.insertvalue %25, %72[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %74 = llvm.insertvalue %43, %73[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %75 = llvm.insertvalue %47, %74[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %76 = llvm.insertvalue %43, %75[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %77 = llvm.getelementptr %arg23[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %78 = llvm.getelementptr %77[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %70, %78 : f16, !llvm.ptr + %79 = llvm.add %39, %38 : i64 + llvm.br ^bb1(%79 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown35(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -3302,69 +801,59 @@ module attributes 
{byre.container_module, gpu.container_module} { %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(147456 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(128 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(1152 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %70 = llvm.load %69 : !llvm.ptr -> f32 - %71 = llvm.fptrunc %70 : f32 to f16 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown20(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { + %18 = llvm.mlir.constant(589824 : index) : i64 + %19 = nvvm.read.ptx.sreg.ctaid.x : i32 + %20 = llvm.sext %19 : i32 to i64 + %21 = nvvm.read.ptx.sreg.ntid.x : i32 + %22 = llvm.sext %21 : i32 to i64 + %23 = nvvm.read.ptx.sreg.tid.x : i32 + %24 = llvm.sext %23 : i32 to i64 + %25 = llvm.mul %22, %20 : i64 + %26 = llvm.add %24, %25 : i64 + %27 = nvvm.read.ptx.sreg.nctaid.x : i32 + %28 = llvm.sext %27 : i32 to i64 + %29 = llvm.mul %22, %28 : i64 + llvm.br ^bb1(%26 : i64) + ^bb1(%30: i64): // 2 preds: ^bb0, ^bb2 + %31 = llvm.icmp "slt" %30, %18 : i64 + llvm.cond_br %31, ^bb2, ^bb3 + ^bb2: // 
pred: ^bb1 + %32 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %33 = llvm.insertvalue %30, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %34 = llvm.mlir.constant(1 : index) : i64 + %35 = llvm.insertvalue %34, %33[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %36 = llvm.mlir.constant(2304 : index) : i64 + %37 = llvm.insertvalue %36, %35[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %34, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.mlir.constant(9 : index) : i64 + %40 = llvm.insertvalue %39, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.insertvalue %34, %40[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %42 = llvm.mlir.constant(3 : index) : i64 + %43 = llvm.getelementptr %arg1[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %44 = llvm.mul %17, %36 : i64 + %45 = llvm.mul %17, %39 : i64 + %46 = llvm.add %44, %45 : i64 + %47 = llvm.mul %17, %42 : i64 + %48 = llvm.add %46, %47 : i64 + %49 = llvm.add %48, %17 : i64 + %50 = llvm.getelementptr %43[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %51 = llvm.load %50 : !llvm.ptr -> f32 + %52 = llvm.fptrunc %51 : f32 to f16 + %53 = llvm.insertvalue %30, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %34, %53[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.insertvalue %36, %54[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %56 = llvm.insertvalue %34, %55[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %39, %56[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.insertvalue %34, %57[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %59 = llvm.getelementptr %arg12[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %60 = llvm.getelementptr %59[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %52, %60 : f16, !llvm.ptr + %61 = llvm.add %30, %29 : i64 + llvm.br ^bb1(%61 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown34(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -3382,59 +871,60 @@ module attributes {byre.container_module, gpu.container_module} { %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %18 = llvm.mlir.constant(0 : index) : i64 - %19 = llvm.mlir.constant(100352 : index) : i64 - %20 = llvm.mlir.constant(28 : index) : i64 - %21 = llvm.mlir.constant(-1 : index) : i64 - %22 = 
nvvm.read.ptx.sreg.ctaid.x : i32 + %17 = llvm.mlir.constant(50176 : index) : i64 + %18 = llvm.mlir.constant(0.000000e+00 : f16) : f16 + %19 = llvm.mlir.constant(0 : index) : i64 + %20 = nvvm.read.ptx.sreg.ctaid.x : i32 + %21 = llvm.sext %20 : i32 to i64 + %22 = nvvm.read.ptx.sreg.ntid.x : i32 %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 + %24 = nvvm.read.ptx.sreg.tid.x : i32 %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %19 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %20 : i64 - %32 = llvm.icmp "slt" %31, %18 : i64 - %33 = llvm.add %31, %20 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %18 : i64 - %36 = llvm.sub %21, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %20 : i64 - %39 = llvm.sub %21, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %20 : i64 - %42 = llvm.icmp "slt" %41, %18 : i64 - %43 = llvm.add %41, %20 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %18 : i64 - %46 = llvm.sub %21, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %20 : i64 - %49 = llvm.sub %21, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.mul %18, %19 : i64 - %52 = llvm.mlir.constant(784 : index) : i64 - %53 = llvm.mul %50, %52 : i64 - %54 = llvm.add %51, %53 : i64 - %55 = llvm.mul %44, %20 : i64 - %56 = llvm.add %54, %55 : i64 - %57 = llvm.add %56, %34 : i64 - %58 = llvm.getelementptr %arg1[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %59 = llvm.load %58 : !llvm.ptr -> f16 - %60 = llvm.intr.maxnum(%59, %17) : (f16, f16) -> f16 - %61 = llvm.getelementptr %arg12[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %60, %61 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown18(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { + %26 = llvm.mul %23, %21 : i64 + %27 = llvm.add %25, %26 : i64 + %28 = nvvm.read.ptx.sreg.nctaid.x : i32 + %29 = llvm.sext %28 : i32 to i64 + %30 = llvm.mul %23, %29 : i64 + llvm.br ^bb1(%27 : i64) + ^bb1(%31: i64): // 2 preds: ^bb0, ^bb2 + %32 = llvm.icmp "slt" %31, %17 : i64 + llvm.cond_br %32, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %33 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %34 = llvm.insertvalue %31, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %35 = llvm.mlir.constant(1 : index) : i64 + %36 = llvm.insertvalue %35, %34[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %37 = llvm.insertvalue %17, %36[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %35, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.mlir.constant(196 : index) : i64 + %40 = llvm.insertvalue %39, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.insertvalue %35, %40[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %42 = llvm.mlir.constant(14 : index) : i64 + %43 = llvm.getelementptr %arg1[%31] : 
(!llvm.ptr, i64) -> !llvm.ptr, f16 + %44 = llvm.mul %19, %17 : i64 + %45 = llvm.mul %19, %39 : i64 + %46 = llvm.add %44, %45 : i64 + %47 = llvm.mul %19, %42 : i64 + %48 = llvm.add %46, %47 : i64 + %49 = llvm.add %48, %19 : i64 + %50 = llvm.getelementptr %43[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %51 = llvm.load %50 : !llvm.ptr -> f16 + %52 = llvm.intr.maximum(%51, %18) : (f16, f16) -> f16 + %53 = llvm.insertvalue %31, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %35, %53[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.insertvalue %17, %54[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %56 = llvm.insertvalue %35, %55[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %39, %56[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.insertvalue %35, %57[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %59 = llvm.getelementptr %arg12[%31] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %60 = llvm.getelementptr %59[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %52, %60 : f16, !llvm.ptr + %61 = llvm.add %31, %30 : i64 + llvm.br ^bb1(%61 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown32(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -3453,69 +943,59 @@ module attributes {byre.container_module, gpu.container_module} { %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(73728 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(64 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 
= llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(576 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %70 = llvm.load %69 : !llvm.ptr -> f32 - %71 = llvm.fptrunc %70 : f32 to f16 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown16(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { + %18 = llvm.mlir.constant(294912 : index) : i64 + %19 = nvvm.read.ptx.sreg.ctaid.x : i32 + %20 = llvm.sext %19 : i32 to i64 + %21 = nvvm.read.ptx.sreg.ntid.x : i32 + %22 = llvm.sext %21 : i32 to i64 + %23 = nvvm.read.ptx.sreg.tid.x : i32 + %24 = llvm.sext %23 : i32 to i64 + %25 = llvm.mul %22, %20 : i64 + %26 = llvm.add %24, %25 : i64 + %27 = nvvm.read.ptx.sreg.nctaid.x : i32 + %28 = llvm.sext %27 : i32 to i64 + %29 = llvm.mul %22, %28 : i64 + llvm.br ^bb1(%26 : i64) + ^bb1(%30: i64): // 2 preds: ^bb0, ^bb2 + %31 = llvm.icmp "slt" %30, %18 : i64 + llvm.cond_br %31, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %32 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %33 = llvm.insertvalue %30, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %34 = llvm.mlir.constant(1 : index) : i64 + %35 = llvm.insertvalue %34, %33[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %36 = llvm.mlir.constant(1152 : index) : i64 + %37 = llvm.insertvalue %36, %35[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %34, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.mlir.constant(9 : index) : i64 + %40 = llvm.insertvalue %39, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.insertvalue %34, %40[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %42 = llvm.mlir.constant(3 : index) : i64 + %43 = llvm.getelementptr %arg1[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %44 = llvm.mul %17, %36 : i64 + %45 = llvm.mul %17, %39 : i64 + %46 = llvm.add %44, %45 : i64 + %47 = llvm.mul %17, %42 : i64 + %48 = llvm.add %46, %47 : i64 + %49 = llvm.add %48, %17 : i64 + %50 = llvm.getelementptr %43[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %51 = llvm.load %50 : !llvm.ptr -> f32 + %52 = llvm.fptrunc %51 : f32 to f16 + %53 = llvm.insertvalue %30, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %34, %53[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = 
llvm.insertvalue %36, %54[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %56 = llvm.insertvalue %34, %55[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %39, %56[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.insertvalue %34, %57[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %59 = llvm.getelementptr %arg12[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %60 = llvm.getelementptr %59[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %52, %60 : f16, !llvm.ptr + %61 = llvm.add %30, %29 : i64 + llvm.br ^bb1(%61 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown30(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -3533,45 +1013,56 @@ module attributes {byre.container_module, gpu.container_module} { %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(8192 : index) : i64 - %19 = llvm.mlir.constant(64 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = nvvm.read.ptx.sreg.ctaid.x : i32 + %17 = llvm.mlir.constant(32768 : index) : i64 + %18 = llvm.mlir.constant(0 : index) : i64 + %19 = nvvm.read.ptx.sreg.ctaid.x : i32 + %20 = llvm.sext %19 : i32 to i64 + %21 = nvvm.read.ptx.sreg.ntid.x : i32 %22 = llvm.sext %21 : i32 to i64 - %23 = nvvm.read.ptx.sreg.ntid.x : i32 + %23 = nvvm.read.ptx.sreg.tid.x : i32 %24 = llvm.sext %23 : i32 to i64 - %25 = nvvm.read.ptx.sreg.tid.x : i32 - %26 = llvm.sext %25 : i32 to i64 - %27 = llvm.mul %24, %22 : i64 - %28 = llvm.add %26, %27 : i64 - %29 = llvm.icmp "slt" %28, %18 : i64 - llvm.cond_br %29, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %30 = llvm.srem %28, %19 : i64 + %25 = llvm.mul %22, %20 : i64 + %26 = llvm.add %24, %25 : i64 + %27 = nvvm.read.ptx.sreg.nctaid.x : i32 + %28 = llvm.sext %27 : i32 to i64 + %29 = llvm.mul %22, %28 : i64 + llvm.br ^bb1(%26 : i64) + ^bb1(%30: i64): // 2 preds: ^bb0, ^bb2 %31 = llvm.icmp "slt" %30, %17 : i64 - %32 = llvm.add %30, %19 : i64 - %33 = llvm.select %31, %32, %30 : i1, i64 - %34 = llvm.icmp "slt" %28, %17 : i64 - %35 = llvm.sub %20, %28 : i64 - %36 = llvm.select %34, %35, %28 : i1, i64 - %37 = llvm.sdiv %36, %19 : i64 - %38 = llvm.sub %20, %37 : i64 - %39 = llvm.select %34, %38, %37 : i1, i64 - %40 = llvm.mul %39, %19 : i64 - %41 = llvm.add %40, %33 : i64 - %42 = llvm.add %41, %17 : i64 - %43 = llvm.add %42, %17 : i64 - %44 = llvm.getelementptr %arg1[%43] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %45 = llvm.load %44 : !llvm.ptr -> f32 - %46 = llvm.fptrunc %45 : f32 to f16 - %47 = llvm.getelementptr %arg12[%43] : (!llvm.ptr, 
i64) -> !llvm.ptr, f16 - llvm.store %46, %47 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown15(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { + llvm.cond_br %31, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %32 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %33 = llvm.insertvalue %30, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %34 = llvm.mlir.constant(1 : index) : i64 + %35 = llvm.insertvalue %34, %33[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %36 = llvm.mlir.constant(128 : index) : i64 + %37 = llvm.insertvalue %36, %35[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %34, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.insertvalue %34, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %40 = llvm.insertvalue %34, %39[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.getelementptr %arg1[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %42 = llvm.mul %18, %36 : i64 + %43 = llvm.add %42, %18 : i64 + %44 = llvm.add %43, %18 : i64 + %45 = llvm.add %44, %18 : i64 + %46 = llvm.getelementptr %41[%45] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %47 = llvm.load %46 : !llvm.ptr -> f32 + %48 = llvm.fptrunc %47 : f32 to f16 + %49 = llvm.insertvalue %30, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %50 = llvm.insertvalue %34, %49[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %51 = llvm.insertvalue %36, %50[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %52 = llvm.insertvalue %34, %51[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %53 = llvm.insertvalue %34, %52[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %34, %53[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.getelementptr %arg12[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %56 = llvm.getelementptr %55[%45] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %48, %56 : f16, !llvm.ptr + %57 = llvm.add %30, %29 : i64 + llvm.br ^bb1(%57 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown23(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr {llvm.noalias}, %arg23: !llvm.ptr {llvm.noalias}, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : 
!llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -3597,62 +1088,70 @@ module attributes {byre.container_module, gpu.container_module} { %22 = llvm.insertvalue %arg26, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %23 = llvm.insertvalue %arg30, %22[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %24 = llvm.insertvalue %arg27, %23[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %25 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %26 = llvm.mlir.constant(0 : index) : i64 - %27 = llvm.mlir.constant(200704 : index) : i64 - %28 = llvm.mlir.constant(56 : index) : i64 - %29 = llvm.mlir.constant(-1 : index) : i64 - %30 = nvvm.read.ptx.sreg.ctaid.x : i32 + %25 = llvm.mlir.constant(100352 : index) : i64 + %26 = llvm.mlir.constant(0.000000e+00 : f16) : f16 + %27 = llvm.mlir.constant(0 : index) : i64 + %28 = nvvm.read.ptx.sreg.ctaid.x : i32 + %29 = llvm.sext %28 : i32 to i64 + %30 = nvvm.read.ptx.sreg.ntid.x : i32 %31 = llvm.sext %30 : i32 to i64 - %32 = nvvm.read.ptx.sreg.ntid.x : i32 + %32 = nvvm.read.ptx.sreg.tid.x : i32 %33 = llvm.sext %32 : i32 to i64 - %34 = nvvm.read.ptx.sreg.tid.x : i32 - %35 = llvm.sext %34 : i32 to i64 - %36 = llvm.mul %33, %31 : i64 - %37 = llvm.add %35, %36 : i64 - %38 = llvm.icmp "slt" %37, %27 : i64 - llvm.cond_br %38, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %39 = llvm.srem %37, %28 : i64 - %40 = llvm.icmp "slt" %39, %26 : i64 - %41 = llvm.add %39, %28 : i64 - %42 = llvm.select %40, %41, %39 : i1, i64 - %43 = llvm.icmp "slt" %37, %26 : i64 - %44 = llvm.sub %29, %37 : i64 - %45 = llvm.select %43, %44, %37 : i1, i64 - %46 = llvm.sdiv %45, %28 : i64 - %47 = llvm.sub %29, %46 : i64 - %48 = llvm.select %43, %47, %46 : i1, i64 - %49 = llvm.srem %48, %28 : i64 - %50 = llvm.icmp "slt" %49, %26 : i64 - %51 = llvm.add %49, %28 : i64 - %52 = llvm.select %50, %51, %49 : i1, i64 - %53 = llvm.icmp "slt" %48, %26 : i64 - %54 = llvm.sub %29, %48 : i64 - %55 = llvm.select %53, %54, %48 : i1, i64 - %56 = llvm.sdiv %55, %28 : i64 - %57 = llvm.sub %29, %56 : i64 - %58 = llvm.select %53, %57, %56 : i1, i64 - %59 = llvm.mul %26, %27 : i64 - %60 = llvm.mlir.constant(3136 : index) : i64 - %61 = llvm.mul %58, %60 : i64 - %62 = llvm.add %59, %61 : i64 - %63 = llvm.mul %52, %28 : i64 - %64 = llvm.add %62, %63 : i64 - %65 = llvm.add %64, %42 : i64 - %66 = llvm.getelementptr %arg1[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %67 = llvm.load %66 : !llvm.ptr -> f16 - %68 = llvm.getelementptr %arg12[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %69 = llvm.load %68 : !llvm.ptr -> f16 - %70 = llvm.fadd %67, %69 : f16 - %71 = llvm.intr.maxnum(%70, %25) : (f16, f16) -> f16 - %72 = llvm.getelementptr %arg23[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown13(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { + %34 = llvm.mul %31, %29 : i64 + %35 = llvm.add %33, %34 : i64 + %36 = nvvm.read.ptx.sreg.nctaid.x : i32 + %37 = llvm.sext %36 : i32 to i64 + %38 = llvm.mul %31, %37 : i64 + llvm.br ^bb1(%35 : i64) + ^bb1(%39: i64): // 2 preds: ^bb0, ^bb2 + %40 = llvm.icmp "slt" %39, %25 : i64 + llvm.cond_br %40, ^bb2, ^bb3 + ^bb2: // pred: 
^bb1 + %41 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %42 = llvm.insertvalue %39, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %43 = llvm.mlir.constant(1 : index) : i64 + %44 = llvm.insertvalue %43, %42[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %45 = llvm.insertvalue %25, %44[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %46 = llvm.insertvalue %43, %45[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %47 = llvm.mlir.constant(784 : index) : i64 + %48 = llvm.insertvalue %47, %46[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %49 = llvm.insertvalue %43, %48[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %50 = llvm.mlir.constant(28 : index) : i64 + %51 = llvm.getelementptr %arg1[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %52 = llvm.mul %27, %25 : i64 + %53 = llvm.mul %27, %47 : i64 + %54 = llvm.add %52, %53 : i64 + %55 = llvm.mul %27, %50 : i64 + %56 = llvm.add %54, %55 : i64 + %57 = llvm.add %56, %27 : i64 + %58 = llvm.getelementptr %51[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %59 = llvm.load %58 : !llvm.ptr -> f16 + %60 = llvm.insertvalue %39, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %61 = llvm.insertvalue %43, %60[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %62 = llvm.insertvalue %25, %61[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %63 = llvm.insertvalue %43, %62[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %64 = llvm.insertvalue %47, %63[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %65 = llvm.insertvalue %43, %64[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %66 = llvm.getelementptr %arg12[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %67 = llvm.getelementptr %66[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %68 = llvm.load %67 : !llvm.ptr -> f16 + %69 = llvm.fadd %59, %68 : f16 + %70 = llvm.intr.maximum(%69, %26) : (f16, f16) -> f16 + %71 = llvm.insertvalue %39, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %72 = llvm.insertvalue %43, %71[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %73 = llvm.insertvalue %25, %72[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %74 = llvm.insertvalue %43, %73[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %75 = llvm.insertvalue %47, %74[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %76 = llvm.insertvalue %43, %75[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %77 = llvm.getelementptr %arg23[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %78 = llvm.getelementptr %77[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %70, %78 : f16, !llvm.ptr + %79 = llvm.add %39, %38 : i64 + llvm.br ^bb1(%79 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown21(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : 
!llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -3671,69 +1170,59 @@ module attributes {byre.container_module, gpu.container_module} { %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(36864 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(64 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(576 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %70 = llvm.load %69 : !llvm.ptr -> f32 - %71 = llvm.fptrunc %70 : f32 to f16 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown12(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { + %18 = llvm.mlir.constant(147456 : index) : i64 + %19 = nvvm.read.ptx.sreg.ctaid.x : i32 + %20 = llvm.sext %19 : i32 to i64 + %21 = nvvm.read.ptx.sreg.ntid.x : i32 + %22 = llvm.sext %21 : i32 to i64 + %23 = nvvm.read.ptx.sreg.tid.x : i32 + %24 = llvm.sext %23 : i32 to i64 + %25 = llvm.mul %22, %20 : i64 + %26 = llvm.add %24, %25 : i64 + %27 = nvvm.read.ptx.sreg.nctaid.x : i32 + %28 = 
llvm.sext %27 : i32 to i64 + %29 = llvm.mul %22, %28 : i64 + llvm.br ^bb1(%26 : i64) + ^bb1(%30: i64): // 2 preds: ^bb0, ^bb2 + %31 = llvm.icmp "slt" %30, %18 : i64 + llvm.cond_br %31, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %32 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %33 = llvm.insertvalue %30, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %34 = llvm.mlir.constant(1 : index) : i64 + %35 = llvm.insertvalue %34, %33[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %36 = llvm.mlir.constant(1152 : index) : i64 + %37 = llvm.insertvalue %36, %35[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %34, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.mlir.constant(9 : index) : i64 + %40 = llvm.insertvalue %39, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.insertvalue %34, %40[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %42 = llvm.mlir.constant(3 : index) : i64 + %43 = llvm.getelementptr %arg1[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %44 = llvm.mul %17, %36 : i64 + %45 = llvm.mul %17, %39 : i64 + %46 = llvm.add %44, %45 : i64 + %47 = llvm.mul %17, %42 : i64 + %48 = llvm.add %46, %47 : i64 + %49 = llvm.add %48, %17 : i64 + %50 = llvm.getelementptr %43[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %51 = llvm.load %50 : !llvm.ptr -> f32 + %52 = llvm.fptrunc %51 : f32 to f16 + %53 = llvm.insertvalue %30, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %34, %53[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.insertvalue %36, %54[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %56 = llvm.insertvalue %34, %55[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %39, %56[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.insertvalue %34, %57[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %59 = llvm.getelementptr %arg12[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %60 = llvm.getelementptr %59[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %52, %60 : f16, !llvm.ptr + %61 = llvm.add %30, %29 : i64 + llvm.br ^bb1(%61 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown20(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -3751,59 +1240,60 @@ module attributes {byre.container_module, gpu.container_module} { %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0.000000e+00 : 
f16) : f16 - %18 = llvm.mlir.constant(0 : index) : i64 - %19 = llvm.mlir.constant(200704 : index) : i64 - %20 = llvm.mlir.constant(56 : index) : i64 - %21 = llvm.mlir.constant(-1 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 + %17 = llvm.mlir.constant(100352 : index) : i64 + %18 = llvm.mlir.constant(0.000000e+00 : f16) : f16 + %19 = llvm.mlir.constant(0 : index) : i64 + %20 = nvvm.read.ptx.sreg.ctaid.x : i32 + %21 = llvm.sext %20 : i32 to i64 + %22 = nvvm.read.ptx.sreg.ntid.x : i32 %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 + %24 = nvvm.read.ptx.sreg.tid.x : i32 %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %19 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %20 : i64 - %32 = llvm.icmp "slt" %31, %18 : i64 - %33 = llvm.add %31, %20 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %18 : i64 - %36 = llvm.sub %21, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %20 : i64 - %39 = llvm.sub %21, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %20 : i64 - %42 = llvm.icmp "slt" %41, %18 : i64 - %43 = llvm.add %41, %20 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %18 : i64 - %46 = llvm.sub %21, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %20 : i64 - %49 = llvm.sub %21, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.mul %18, %19 : i64 - %52 = llvm.mlir.constant(3136 : index) : i64 - %53 = llvm.mul %50, %52 : i64 - %54 = llvm.add %51, %53 : i64 - %55 = llvm.mul %44, %20 : i64 - %56 = llvm.add %54, %55 : i64 - %57 = llvm.add %56, %34 : i64 - %58 = llvm.getelementptr %arg1[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %59 = llvm.load %58 : !llvm.ptr -> f16 - %60 = llvm.intr.maxnum(%59, %17) : (f16, f16) -> f16 - %61 = llvm.getelementptr %arg12[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %60, %61 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown10(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { + %26 = llvm.mul %23, %21 : i64 + %27 = llvm.add %25, %26 : i64 + %28 = nvvm.read.ptx.sreg.nctaid.x : i32 + %29 = llvm.sext %28 : i32 to i64 + %30 = llvm.mul %23, %29 : i64 + llvm.br ^bb1(%27 : i64) + ^bb1(%31: i64): // 2 preds: ^bb0, ^bb2 + %32 = llvm.icmp "slt" %31, %17 : i64 + llvm.cond_br %32, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %33 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %34 = llvm.insertvalue %31, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %35 = llvm.mlir.constant(1 : index) : i64 + %36 = llvm.insertvalue %35, %34[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %37 = llvm.insertvalue %17, %36[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %35, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.mlir.constant(784 : index) : i64 + %40 = llvm.insertvalue %39, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x 
i64>)> + %41 = llvm.insertvalue %35, %40[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %42 = llvm.mlir.constant(28 : index) : i64 + %43 = llvm.getelementptr %arg1[%31] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %44 = llvm.mul %19, %17 : i64 + %45 = llvm.mul %19, %39 : i64 + %46 = llvm.add %44, %45 : i64 + %47 = llvm.mul %19, %42 : i64 + %48 = llvm.add %46, %47 : i64 + %49 = llvm.add %48, %19 : i64 + %50 = llvm.getelementptr %43[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %51 = llvm.load %50 : !llvm.ptr -> f16 + %52 = llvm.intr.maximum(%51, %18) : (f16, f16) -> f16 + %53 = llvm.insertvalue %31, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %35, %53[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.insertvalue %17, %54[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %56 = llvm.insertvalue %35, %55[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %39, %56[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.insertvalue %35, %57[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %59 = llvm.getelementptr %arg12[%31] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %60 = llvm.getelementptr %59[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %52, %60 : f16, !llvm.ptr + %61 = llvm.add %31, %30 : i64 + llvm.br ^bb1(%61 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown18(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -3822,69 +1312,59 @@ module attributes {byre.container_module, gpu.container_module} { %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(36864 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(64 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 
- %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(576 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %70 = llvm.load %69 : !llvm.ptr -> f32 - %71 = llvm.fptrunc %70 : f32 to f16 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown9(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr, %arg23: !llvm.ptr, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { + %18 = llvm.mlir.constant(73728 : index) : i64 + %19 = nvvm.read.ptx.sreg.ctaid.x : i32 + %20 = llvm.sext %19 : i32 to i64 + %21 = nvvm.read.ptx.sreg.ntid.x : i32 + %22 = llvm.sext %21 : i32 to i64 + %23 = nvvm.read.ptx.sreg.tid.x : i32 + %24 = llvm.sext %23 : i32 to i64 + %25 = llvm.mul %22, %20 : i64 + %26 = llvm.add %24, %25 : i64 + %27 = nvvm.read.ptx.sreg.nctaid.x : i32 + %28 = llvm.sext %27 : i32 to i64 + %29 = llvm.mul %22, %28 : i64 + llvm.br ^bb1(%26 : i64) + ^bb1(%30: i64): // 2 preds: ^bb0, ^bb2 + %31 = llvm.icmp "slt" %30, %18 : i64 + llvm.cond_br %31, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %32 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %33 = llvm.insertvalue %30, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %34 = llvm.mlir.constant(1 : index) : i64 + %35 = llvm.insertvalue %34, %33[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %36 = llvm.mlir.constant(576 : index) : i64 + %37 = llvm.insertvalue %36, %35[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %34, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.mlir.constant(9 : index) : i64 + %40 = llvm.insertvalue %39, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.insertvalue %34, %40[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %42 = llvm.mlir.constant(3 : index) : i64 + %43 = llvm.getelementptr %arg1[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %44 = llvm.mul %17, %36 : i64 + %45 = llvm.mul %17, %39 : i64 + %46 = llvm.add %44, %45 : i64 + %47 = llvm.mul %17, %42 : i64 + %48 = llvm.add %46, %47 : i64 + %49 = llvm.add %48, %17 : i64 + %50 = 
llvm.getelementptr %43[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %51 = llvm.load %50 : !llvm.ptr -> f32 + %52 = llvm.fptrunc %51 : f32 to f16 + %53 = llvm.insertvalue %30, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %34, %53[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.insertvalue %36, %54[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %56 = llvm.insertvalue %34, %55[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %39, %56[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.insertvalue %34, %57[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %59 = llvm.getelementptr %arg12[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %60 = llvm.getelementptr %59[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %52, %60 : f16, !llvm.ptr + %61 = llvm.add %30, %29 : i64 + llvm.br ^bb1(%61 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown16(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -3902,70 +1382,56 @@ module attributes {byre.container_module, gpu.container_module} { %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.insertvalue %arg22, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %18 = llvm.insertvalue %arg23, %17[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %19 = llvm.insertvalue %arg24, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %20 = llvm.insertvalue %arg25, %19[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %21 = llvm.insertvalue %arg29, %20[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %22 = llvm.insertvalue %arg26, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %23 = llvm.insertvalue %arg30, %22[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %24 = llvm.insertvalue %arg27, %23[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %25 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %26 = llvm.mlir.constant(0 : index) : i64 - %27 = llvm.mlir.constant(200704 : index) : i64 - %28 = llvm.mlir.constant(56 : index) : i64 - %29 = llvm.mlir.constant(-1 : index) : i64 - %30 = nvvm.read.ptx.sreg.ctaid.x : i32 - %31 = llvm.sext %30 : i32 to i64 - %32 = nvvm.read.ptx.sreg.ntid.x : i32 - %33 = llvm.sext %32 : i32 to i64 - %34 = nvvm.read.ptx.sreg.tid.x : i32 - %35 = llvm.sext %34 : i32 to i64 - %36 = llvm.mul %33, %31 : i64 - %37 = llvm.add %35, %36 : i64 - %38 = llvm.icmp 
"slt" %37, %27 : i64 - llvm.cond_br %38, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %39 = llvm.srem %37, %28 : i64 - %40 = llvm.icmp "slt" %39, %26 : i64 - %41 = llvm.add %39, %28 : i64 - %42 = llvm.select %40, %41, %39 : i1, i64 - %43 = llvm.icmp "slt" %37, %26 : i64 - %44 = llvm.sub %29, %37 : i64 - %45 = llvm.select %43, %44, %37 : i1, i64 - %46 = llvm.sdiv %45, %28 : i64 - %47 = llvm.sub %29, %46 : i64 - %48 = llvm.select %43, %47, %46 : i1, i64 - %49 = llvm.srem %48, %28 : i64 - %50 = llvm.icmp "slt" %49, %26 : i64 - %51 = llvm.add %49, %28 : i64 - %52 = llvm.select %50, %51, %49 : i1, i64 - %53 = llvm.icmp "slt" %48, %26 : i64 - %54 = llvm.sub %29, %48 : i64 - %55 = llvm.select %53, %54, %48 : i1, i64 - %56 = llvm.sdiv %55, %28 : i64 - %57 = llvm.sub %29, %56 : i64 - %58 = llvm.select %53, %57, %56 : i1, i64 - %59 = llvm.mul %26, %27 : i64 - %60 = llvm.mlir.constant(3136 : index) : i64 - %61 = llvm.mul %58, %60 : i64 - %62 = llvm.add %59, %61 : i64 - %63 = llvm.mul %52, %28 : i64 - %64 = llvm.add %62, %63 : i64 - %65 = llvm.add %64, %42 : i64 - %66 = llvm.getelementptr %arg1[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %67 = llvm.load %66 : !llvm.ptr -> f16 - %68 = llvm.getelementptr %arg12[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %69 = llvm.load %68 : !llvm.ptr -> f16 - %70 = llvm.fadd %67, %69 : f16 - %71 = llvm.intr.maxnum(%70, %25) : (f16, f16) -> f16 - %72 = llvm.getelementptr %arg23[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown7(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { + %17 = llvm.mlir.constant(8192 : index) : i64 + %18 = llvm.mlir.constant(0 : index) : i64 + %19 = nvvm.read.ptx.sreg.ctaid.x : i32 + %20 = llvm.sext %19 : i32 to i64 + %21 = nvvm.read.ptx.sreg.ntid.x : i32 + %22 = llvm.sext %21 : i32 to i64 + %23 = nvvm.read.ptx.sreg.tid.x : i32 + %24 = llvm.sext %23 : i32 to i64 + %25 = llvm.mul %22, %20 : i64 + %26 = llvm.add %24, %25 : i64 + %27 = nvvm.read.ptx.sreg.nctaid.x : i32 + %28 = llvm.sext %27 : i32 to i64 + %29 = llvm.mul %22, %28 : i64 + llvm.br ^bb1(%26 : i64) + ^bb1(%30: i64): // 2 preds: ^bb0, ^bb2 + %31 = llvm.icmp "slt" %30, %17 : i64 + llvm.cond_br %31, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %32 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %33 = llvm.insertvalue %30, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %34 = llvm.mlir.constant(1 : index) : i64 + %35 = llvm.insertvalue %34, %33[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %36 = llvm.mlir.constant(64 : index) : i64 + %37 = llvm.insertvalue %36, %35[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %34, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.insertvalue %34, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %40 = llvm.insertvalue %34, %39[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.getelementptr %arg1[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %42 = llvm.mul %18, %36 : i64 + %43 = llvm.add %42, %18 : i64 + %44 = llvm.add %43, %18 : i64 + %45 = llvm.add %44, %18 : i64 + %46 = 
llvm.getelementptr %41[%45] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %47 = llvm.load %46 : !llvm.ptr -> f32 + %48 = llvm.fptrunc %47 : f32 to f16 + %49 = llvm.insertvalue %30, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %50 = llvm.insertvalue %34, %49[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %51 = llvm.insertvalue %36, %50[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %52 = llvm.insertvalue %34, %51[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %53 = llvm.insertvalue %34, %52[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %34, %53[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.getelementptr %arg12[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %56 = llvm.getelementptr %55[%45] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %48, %56 : f16, !llvm.ptr + %57 = llvm.add %30, %29 : i64 + llvm.br ^bb1(%57 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown9(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64, %arg22: !llvm.ptr {llvm.noalias}, %arg23: !llvm.ptr {llvm.noalias}, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: i64, %arg28: i64, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -3983,70 +1449,78 @@ module attributes {byre.container_module, gpu.container_module} { %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(36864 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(64 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = 
llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(576 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %70 = llvm.load %69 : !llvm.ptr -> f32 - %71 = llvm.fptrunc %70 : f32 to f16 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown6(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { + %17 = llvm.insertvalue %arg22, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %18 = llvm.insertvalue %arg23, %17[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %19 = llvm.insertvalue %arg24, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %20 = llvm.insertvalue %arg25, %19[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %21 = llvm.insertvalue %arg29, %20[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %22 = llvm.insertvalue %arg26, %21[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %23 = llvm.insertvalue %arg30, %22[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %24 = llvm.insertvalue %arg27, %23[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %25 = llvm.mlir.constant(200704 : index) : i64 + %26 = llvm.mlir.constant(0.000000e+00 : f16) : f16 + %27 = llvm.mlir.constant(0 : index) : i64 + %28 = nvvm.read.ptx.sreg.ctaid.x : i32 + %29 = llvm.sext %28 : i32 to i64 + %30 = nvvm.read.ptx.sreg.ntid.x : i32 + %31 = llvm.sext %30 : i32 to i64 + %32 = nvvm.read.ptx.sreg.tid.x : i32 + %33 = llvm.sext %32 : i32 to i64 + %34 = llvm.mul %31, %29 : i64 + %35 = llvm.add %33, %34 : i64 + %36 = nvvm.read.ptx.sreg.nctaid.x : i32 + %37 = llvm.sext %36 : i32 to i64 + %38 = llvm.mul %31, %37 : i64 + llvm.br ^bb1(%35 : i64) + ^bb1(%39: i64): // 2 preds: ^bb0, ^bb2 + %40 = llvm.icmp "slt" %39, %25 : i64 + llvm.cond_br %40, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %41 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %42 = llvm.insertvalue %39, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %43 = llvm.mlir.constant(1 : index) : i64 + %44 = llvm.insertvalue %43, %42[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %45 = llvm.insertvalue %25, %44[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %46 = llvm.insertvalue %43, %45[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> 
+ %47 = llvm.mlir.constant(3136 : index) : i64 + %48 = llvm.insertvalue %47, %46[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %49 = llvm.insertvalue %43, %48[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %50 = llvm.mlir.constant(56 : index) : i64 + %51 = llvm.getelementptr %arg1[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %52 = llvm.mul %27, %25 : i64 + %53 = llvm.mul %27, %47 : i64 + %54 = llvm.add %52, %53 : i64 + %55 = llvm.mul %27, %50 : i64 + %56 = llvm.add %54, %55 : i64 + %57 = llvm.add %56, %27 : i64 + %58 = llvm.getelementptr %51[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %59 = llvm.load %58 : !llvm.ptr -> f16 + %60 = llvm.insertvalue %39, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %61 = llvm.insertvalue %43, %60[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %62 = llvm.insertvalue %25, %61[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %63 = llvm.insertvalue %43, %62[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %64 = llvm.insertvalue %47, %63[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %65 = llvm.insertvalue %43, %64[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %66 = llvm.getelementptr %arg12[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %67 = llvm.getelementptr %66[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %68 = llvm.load %67 : !llvm.ptr -> f16 + %69 = llvm.fadd %59, %68 : f16 + %70 = llvm.intr.maximum(%69, %26) : (f16, f16) -> f16 + %71 = llvm.insertvalue %39, %18[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %72 = llvm.insertvalue %43, %71[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %73 = llvm.insertvalue %25, %72[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %74 = llvm.insertvalue %43, %73[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %75 = llvm.insertvalue %47, %74[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %76 = llvm.insertvalue %43, %75[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %77 = llvm.getelementptr %arg23[%39] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %78 = llvm.getelementptr %77[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %70, %78 : f16, !llvm.ptr + %79 = llvm.add %39, %38 : i64 + llvm.br ^bb1(%79 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown6(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -4064,59 +1538,60 @@ module attributes {byre.container_module, gpu.container_module} { %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, 
ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %18 = llvm.mlir.constant(0 : index) : i64 - %19 = llvm.mlir.constant(200704 : index) : i64 - %20 = llvm.mlir.constant(56 : index) : i64 - %21 = llvm.mlir.constant(-1 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 + %17 = llvm.mlir.constant(200704 : index) : i64 + %18 = llvm.mlir.constant(0.000000e+00 : f16) : f16 + %19 = llvm.mlir.constant(0 : index) : i64 + %20 = nvvm.read.ptx.sreg.ctaid.x : i32 + %21 = llvm.sext %20 : i32 to i64 + %22 = nvvm.read.ptx.sreg.ntid.x : i32 %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 + %24 = nvvm.read.ptx.sreg.tid.x : i32 %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %19 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %20 : i64 - %32 = llvm.icmp "slt" %31, %18 : i64 - %33 = llvm.add %31, %20 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %18 : i64 - %36 = llvm.sub %21, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %20 : i64 - %39 = llvm.sub %21, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %20 : i64 - %42 = llvm.icmp "slt" %41, %18 : i64 - %43 = llvm.add %41, %20 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %18 : i64 - %46 = llvm.sub %21, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %20 : i64 - %49 = llvm.sub %21, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.mul %18, %19 : i64 - %52 = llvm.mlir.constant(3136 : index) : i64 - %53 = llvm.mul %50, %52 : i64 - %54 = llvm.add %51, %53 : i64 - %55 = llvm.mul %44, %20 : i64 - %56 = llvm.add %54, %55 : i64 - %57 = llvm.add %56, %34 : i64 - %58 = llvm.getelementptr %arg1[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %59 = llvm.load %58 : !llvm.ptr -> f16 - %60 = llvm.intr.maxnum(%59, %17) : (f16, f16) -> f16 - %61 = llvm.getelementptr %arg12[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %60, %61 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown4(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { + %26 = llvm.mul %23, %21 : i64 + %27 = llvm.add %25, %26 : i64 + %28 = nvvm.read.ptx.sreg.nctaid.x : i32 + %29 = llvm.sext %28 : i32 to i64 + %30 = llvm.mul %23, %29 : i64 + llvm.br ^bb1(%27 : i64) + ^bb1(%31: i64): // 2 preds: ^bb0, ^bb2 + %32 = llvm.icmp "slt" %31, %17 : i64 + llvm.cond_br %32, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %33 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %34 = llvm.insertvalue %31, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %35 = llvm.mlir.constant(1 : index) : i64 + %36 = llvm.insertvalue %35, %34[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %37 = llvm.insertvalue %17, %36[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %35, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.mlir.constant(3136 : index) : i64 + %40 = 
llvm.insertvalue %39, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.insertvalue %35, %40[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %42 = llvm.mlir.constant(56 : index) : i64 + %43 = llvm.getelementptr %arg1[%31] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %44 = llvm.mul %19, %17 : i64 + %45 = llvm.mul %19, %39 : i64 + %46 = llvm.add %44, %45 : i64 + %47 = llvm.mul %19, %42 : i64 + %48 = llvm.add %46, %47 : i64 + %49 = llvm.add %48, %19 : i64 + %50 = llvm.getelementptr %43[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %51 = llvm.load %50 : !llvm.ptr -> f16 + %52 = llvm.intr.maximum(%51, %18) : (f16, f16) -> f16 + %53 = llvm.insertvalue %31, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %35, %53[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.insertvalue %17, %54[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %56 = llvm.insertvalue %35, %55[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %39, %56[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.insertvalue %35, %57[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %59 = llvm.getelementptr %arg12[%31] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %60 = llvm.getelementptr %59[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %52, %60 : f16, !llvm.ptr + %61 = llvm.add %31, %30 : i64 + llvm.br ^bb1(%61 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown4(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -4136,68 +1611,58 @@ module attributes {byre.container_module, gpu.container_module} { %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %17 = llvm.mlir.constant(0 : index) : i64 %18 = llvm.mlir.constant(36864 : index) : i64 - %19 = llvm.mlir.constant(3 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(64 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add 
%41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(576 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(9 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %70 = llvm.load %69 : !llvm.ptr -> f32 - %71 = llvm.fptrunc %70 : f32 to f16 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown3(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { + %19 = nvvm.read.ptx.sreg.ctaid.x : i32 + %20 = llvm.sext %19 : i32 to i64 + %21 = nvvm.read.ptx.sreg.ntid.x : i32 + %22 = llvm.sext %21 : i32 to i64 + %23 = nvvm.read.ptx.sreg.tid.x : i32 + %24 = llvm.sext %23 : i32 to i64 + %25 = llvm.mul %22, %20 : i64 + %26 = llvm.add %24, %25 : i64 + %27 = nvvm.read.ptx.sreg.nctaid.x : i32 + %28 = llvm.sext %27 : i32 to i64 + %29 = llvm.mul %22, %28 : i64 + llvm.br ^bb1(%26 : i64) + ^bb1(%30: i64): // 2 preds: ^bb0, ^bb2 + %31 = llvm.icmp "slt" %30, %18 : i64 + llvm.cond_br %31, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %32 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %33 = llvm.insertvalue %30, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %34 = llvm.mlir.constant(1 : index) : i64 + %35 = llvm.insertvalue %34, %33[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %36 = llvm.mlir.constant(576 : index) : i64 + %37 = llvm.insertvalue %36, %35[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %34, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.mlir.constant(9 : index) : i64 + %40 = llvm.insertvalue %39, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.insertvalue %34, %40[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %42 = llvm.mlir.constant(3 : index) : i64 + %43 = llvm.getelementptr %arg1[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %44 = llvm.mul %17, %36 : i64 + %45 = llvm.mul %17, %39 : i64 + %46 = llvm.add %44, %45 : i64 + %47 = llvm.mul %17, %42 : i64 + %48 = llvm.add %46, %47 : i64 + %49 = llvm.add %48, %17 : i64 + %50 = llvm.getelementptr %43[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %51 = llvm.load %50 : !llvm.ptr -> f32 + %52 = llvm.fptrunc %51 : f32 to f16 + %53 = llvm.insertvalue %30, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x 
i64>)> + %54 = llvm.insertvalue %34, %53[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.insertvalue %36, %54[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %56 = llvm.insertvalue %34, %55[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %39, %56[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.insertvalue %34, %57[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %59 = llvm.getelementptr %arg12[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %60 = llvm.getelementptr %59[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %52, %60 : f16, !llvm.ptr + %61 = llvm.add %30, %29 : i64 + llvm.br ^bb1(%61 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown3(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -4215,59 +1680,60 @@ module attributes {byre.container_module, gpu.container_module} { %14 = llvm.insertvalue %arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0.000000e+00 : f16) : f16 - %18 = llvm.mlir.constant(0 : index) : i64 - %19 = llvm.mlir.constant(802816 : index) : i64 - %20 = llvm.mlir.constant(112 : index) : i64 - %21 = llvm.mlir.constant(-1 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 + %17 = llvm.mlir.constant(802816 : index) : i64 + %18 = llvm.mlir.constant(0.000000e+00 : f16) : f16 + %19 = llvm.mlir.constant(0 : index) : i64 + %20 = nvvm.read.ptx.sreg.ctaid.x : i32 + %21 = llvm.sext %20 : i32 to i64 + %22 = nvvm.read.ptx.sreg.ntid.x : i32 %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 + %24 = nvvm.read.ptx.sreg.tid.x : i32 %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %19 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %20 : i64 - %32 = llvm.icmp "slt" %31, %18 : i64 - %33 = llvm.add %31, %20 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %18 : i64 - %36 = llvm.sub %21, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %20 : i64 - %39 = llvm.sub %21, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %20 : i64 - %42 = llvm.icmp "slt" %41, %18 : i64 - %43 = llvm.add %41, %20 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %18 : i64 - %46 = llvm.sub %21, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %20 : i64 - %49 = llvm.sub %21, %48 : i64 - %50 = 
llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.mul %18, %19 : i64 - %52 = llvm.mlir.constant(12544 : index) : i64 - %53 = llvm.mul %50, %52 : i64 - %54 = llvm.add %51, %53 : i64 - %55 = llvm.mul %44, %20 : i64 - %56 = llvm.add %54, %55 : i64 - %57 = llvm.add %56, %34 : i64 - %58 = llvm.getelementptr %arg1[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - %59 = llvm.load %58 : !llvm.ptr -> f16 - %60 = llvm.intr.maxnum(%59, %17) : (f16, f16) -> f16 - %61 = llvm.getelementptr %arg12[%57] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %60, %61 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown1(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { + %26 = llvm.mul %23, %21 : i64 + %27 = llvm.add %25, %26 : i64 + %28 = nvvm.read.ptx.sreg.nctaid.x : i32 + %29 = llvm.sext %28 : i32 to i64 + %30 = llvm.mul %23, %29 : i64 + llvm.br ^bb1(%27 : i64) + ^bb1(%31: i64): // 2 preds: ^bb0, ^bb2 + %32 = llvm.icmp "slt" %31, %17 : i64 + llvm.cond_br %32, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %33 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %34 = llvm.insertvalue %31, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %35 = llvm.mlir.constant(1 : index) : i64 + %36 = llvm.insertvalue %35, %34[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %37 = llvm.insertvalue %17, %36[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %35, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.mlir.constant(12544 : index) : i64 + %40 = llvm.insertvalue %39, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.insertvalue %35, %40[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %42 = llvm.mlir.constant(112 : index) : i64 + %43 = llvm.getelementptr %arg1[%31] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %44 = llvm.mul %19, %17 : i64 + %45 = llvm.mul %19, %39 : i64 + %46 = llvm.add %44, %45 : i64 + %47 = llvm.mul %19, %42 : i64 + %48 = llvm.add %46, %47 : i64 + %49 = llvm.add %48, %19 : i64 + %50 = llvm.getelementptr %43[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %51 = llvm.load %50 : !llvm.ptr -> f16 + %52 = llvm.intr.maximum(%51, %18) : (f16, f16) -> f16 + %53 = llvm.insertvalue %31, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %35, %53[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.insertvalue %17, %54[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %56 = llvm.insertvalue %35, %55[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %39, %56[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.insertvalue %35, %57[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %59 = llvm.getelementptr %arg12[%31] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %60 = llvm.getelementptr %59[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %52, %60 : f16, !llvm.ptr + %61 = llvm.add %31, %30 : i64 + llvm.br ^bb1(%61 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown1(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr 
{llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -4287,68 +1753,58 @@ module attributes {byre.container_module, gpu.container_module} { %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %17 = llvm.mlir.constant(0 : index) : i64 %18 = llvm.mlir.constant(9408 : index) : i64 - %19 = llvm.mlir.constant(7 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = llvm.mlir.constant(3 : index) : i64 - %22 = nvvm.read.ptx.sreg.ctaid.x : i32 - %23 = llvm.sext %22 : i32 to i64 - %24 = nvvm.read.ptx.sreg.ntid.x : i32 - %25 = llvm.sext %24 : i32 to i64 - %26 = nvvm.read.ptx.sreg.tid.x : i32 - %27 = llvm.sext %26 : i32 to i64 - %28 = llvm.mul %25, %23 : i64 - %29 = llvm.add %27, %28 : i64 - %30 = llvm.icmp "slt" %29, %18 : i64 - llvm.cond_br %30, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %31 = llvm.srem %29, %19 : i64 - %32 = llvm.icmp "slt" %31, %17 : i64 - %33 = llvm.add %31, %19 : i64 - %34 = llvm.select %32, %33, %31 : i1, i64 - %35 = llvm.icmp "slt" %29, %17 : i64 - %36 = llvm.sub %20, %29 : i64 - %37 = llvm.select %35, %36, %29 : i1, i64 - %38 = llvm.sdiv %37, %19 : i64 - %39 = llvm.sub %20, %38 : i64 - %40 = llvm.select %35, %39, %38 : i1, i64 - %41 = llvm.srem %40, %19 : i64 - %42 = llvm.icmp "slt" %41, %17 : i64 - %43 = llvm.add %41, %19 : i64 - %44 = llvm.select %42, %43, %41 : i1, i64 - %45 = llvm.icmp "slt" %40, %17 : i64 - %46 = llvm.sub %20, %40 : i64 - %47 = llvm.select %45, %46, %40 : i1, i64 - %48 = llvm.sdiv %47, %19 : i64 - %49 = llvm.sub %20, %48 : i64 - %50 = llvm.select %45, %49, %48 : i1, i64 - %51 = llvm.srem %50, %21 : i64 - %52 = llvm.icmp "slt" %51, %17 : i64 - %53 = llvm.add %51, %21 : i64 - %54 = llvm.select %52, %53, %51 : i1, i64 - %55 = llvm.icmp "slt" %50, %17 : i64 - %56 = llvm.sub %20, %50 : i64 - %57 = llvm.select %55, %56, %50 : i1, i64 - %58 = llvm.sdiv %57, %21 : i64 - %59 = llvm.sub %20, %58 : i64 - %60 = llvm.select %55, %59, %58 : i1, i64 - %61 = llvm.mlir.constant(147 : index) : i64 - %62 = llvm.mul %60, %61 : i64 - %63 = llvm.mlir.constant(49 : index) : i64 - %64 = llvm.mul %54, %63 : i64 - %65 = llvm.add %62, %64 : i64 - %66 = llvm.mul %44, %19 : i64 - %67 = llvm.add %65, %66 : i64 - %68 = llvm.add %67, %34 : i64 - %69 = llvm.getelementptr %arg1[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %70 = llvm.load %69 : !llvm.ptr -> f32 - %71 = llvm.fptrunc %70 : f32 to f16 - %72 = llvm.getelementptr %arg12[%68] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %71, %72 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 - llvm.return - } - llvm.func @Unknown0(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr, %arg12: !llvm.ptr, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { + %19 = 
nvvm.read.ptx.sreg.ctaid.x : i32 + %20 = llvm.sext %19 : i32 to i64 + %21 = nvvm.read.ptx.sreg.ntid.x : i32 + %22 = llvm.sext %21 : i32 to i64 + %23 = nvvm.read.ptx.sreg.tid.x : i32 + %24 = llvm.sext %23 : i32 to i64 + %25 = llvm.mul %22, %20 : i64 + %26 = llvm.add %24, %25 : i64 + %27 = nvvm.read.ptx.sreg.nctaid.x : i32 + %28 = llvm.sext %27 : i32 to i64 + %29 = llvm.mul %22, %28 : i64 + llvm.br ^bb1(%26 : i64) + ^bb1(%30: i64): // 2 preds: ^bb0, ^bb2 + %31 = llvm.icmp "slt" %30, %18 : i64 + llvm.cond_br %31, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %32 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %33 = llvm.insertvalue %30, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %34 = llvm.mlir.constant(1 : index) : i64 + %35 = llvm.insertvalue %34, %33[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %36 = llvm.mlir.constant(147 : index) : i64 + %37 = llvm.insertvalue %36, %35[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.insertvalue %34, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %39 = llvm.mlir.constant(49 : index) : i64 + %40 = llvm.insertvalue %39, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.insertvalue %34, %40[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %42 = llvm.mlir.constant(7 : index) : i64 + %43 = llvm.getelementptr %arg1[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %44 = llvm.mul %17, %36 : i64 + %45 = llvm.mul %17, %39 : i64 + %46 = llvm.add %44, %45 : i64 + %47 = llvm.mul %17, %42 : i64 + %48 = llvm.add %46, %47 : i64 + %49 = llvm.add %48, %17 : i64 + %50 = llvm.getelementptr %43[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %51 = llvm.load %50 : !llvm.ptr -> f32 + %52 = llvm.fptrunc %51 : f32 to f16 + %53 = llvm.insertvalue %30, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %34, %53[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.insertvalue %36, %54[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %56 = llvm.insertvalue %34, %55[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %39, %56[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.insertvalue %34, %57[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %59 = llvm.getelementptr %arg12[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %60 = llvm.getelementptr %59[%49] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %52, %60 : f16, !llvm.ptr + %61 = llvm.add %30, %29 : i64 + llvm.br ^bb1(%61 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.func @Unknown0(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: i64, %arg11: !llvm.ptr {llvm.noalias}, %arg12: !llvm.ptr {llvm.noalias}, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: i64, %arg19: i64, %arg20: i64, %arg21: i64) attributes {gpu.kernel, nvvm.kernel} { %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> @@ -4366,55 +1822,238 @@ module attributes {byre.container_module, gpu.container_module} { %14 = llvm.insertvalue 
%arg15, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %15 = llvm.insertvalue %arg19, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> %16 = llvm.insertvalue %arg16, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> - %17 = llvm.mlir.constant(0 : index) : i64 - %18 = llvm.mlir.constant(150528 : index) : i64 - %19 = llvm.mlir.constant(224 : index) : i64 - %20 = llvm.mlir.constant(-1 : index) : i64 - %21 = nvvm.read.ptx.sreg.ctaid.x : i32 + %17 = llvm.mlir.constant(150528 : index) : i64 + %18 = llvm.mlir.constant(0 : index) : i64 + %19 = nvvm.read.ptx.sreg.ctaid.x : i32 + %20 = llvm.sext %19 : i32 to i64 + %21 = nvvm.read.ptx.sreg.ntid.x : i32 %22 = llvm.sext %21 : i32 to i64 - %23 = nvvm.read.ptx.sreg.ntid.x : i32 + %23 = nvvm.read.ptx.sreg.tid.x : i32 %24 = llvm.sext %23 : i32 to i64 - %25 = nvvm.read.ptx.sreg.tid.x : i32 - %26 = llvm.sext %25 : i32 to i64 - %27 = llvm.mul %24, %22 : i64 - %28 = llvm.add %26, %27 : i64 - %29 = llvm.icmp "slt" %28, %18 : i64 - llvm.cond_br %29, ^bb1, ^bb2 - ^bb1: // pred: ^bb0 - %30 = llvm.srem %28, %19 : i64 + %25 = llvm.mul %22, %20 : i64 + %26 = llvm.add %24, %25 : i64 + %27 = nvvm.read.ptx.sreg.nctaid.x : i32 + %28 = llvm.sext %27 : i32 to i64 + %29 = llvm.mul %22, %28 : i64 + llvm.br ^bb1(%26 : i64) + ^bb1(%30: i64): // 2 preds: ^bb0, ^bb2 %31 = llvm.icmp "slt" %30, %17 : i64 - %32 = llvm.add %30, %19 : i64 - %33 = llvm.select %31, %32, %30 : i1, i64 - %34 = llvm.icmp "slt" %28, %17 : i64 - %35 = llvm.sub %20, %28 : i64 - %36 = llvm.select %34, %35, %28 : i1, i64 - %37 = llvm.sdiv %36, %19 : i64 - %38 = llvm.sub %20, %37 : i64 - %39 = llvm.select %34, %38, %37 : i1, i64 - %40 = llvm.srem %39, %19 : i64 - %41 = llvm.icmp "slt" %40, %17 : i64 - %42 = llvm.add %40, %19 : i64 - %43 = llvm.select %41, %42, %40 : i1, i64 - %44 = llvm.icmp "slt" %39, %17 : i64 - %45 = llvm.sub %20, %39 : i64 - %46 = llvm.select %44, %45, %39 : i1, i64 - %47 = llvm.sdiv %46, %19 : i64 - %48 = llvm.sub %20, %47 : i64 - %49 = llvm.select %44, %48, %47 : i1, i64 - %50 = llvm.mul %17, %18 : i64 - %51 = llvm.mlir.constant(50176 : index) : i64 - %52 = llvm.mul %49, %51 : i64 - %53 = llvm.add %50, %52 : i64 - %54 = llvm.mul %43, %19 : i64 - %55 = llvm.add %53, %54 : i64 - %56 = llvm.add %55, %33 : i64 - %57 = llvm.getelementptr %arg1[%56] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %58 = llvm.load %57 : !llvm.ptr -> f32 - %59 = llvm.fptrunc %58 : f32 to f16 - %60 = llvm.getelementptr %arg12[%56] : (!llvm.ptr, i64) -> !llvm.ptr, f16 - llvm.store %59, %60 : f16, !llvm.ptr - llvm.br ^bb2 - ^bb2: // 2 preds: ^bb0, ^bb1 + llvm.cond_br %31, ^bb2, ^bb3 + ^bb2: // pred: ^bb1 + %32 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %33 = llvm.insertvalue %30, %2[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %34 = llvm.mlir.constant(1 : index) : i64 + %35 = llvm.insertvalue %34, %33[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %36 = llvm.insertvalue %17, %35[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %37 = llvm.insertvalue %34, %36[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %38 = llvm.mlir.constant(50176 : index) : i64 + %39 = llvm.insertvalue %38, %37[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %40 = llvm.insertvalue %34, %39[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %41 = llvm.mlir.constant(224 : index) : i64 + %42 = llvm.getelementptr %arg1[%30] : 
(!llvm.ptr, i64) -> !llvm.ptr, f32 + %43 = llvm.mul %18, %17 : i64 + %44 = llvm.mul %18, %38 : i64 + %45 = llvm.add %43, %44 : i64 + %46 = llvm.mul %18, %41 : i64 + %47 = llvm.add %45, %46 : i64 + %48 = llvm.add %47, %18 : i64 + %49 = llvm.getelementptr %42[%48] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %50 = llvm.load %49 : !llvm.ptr -> f32 + %51 = llvm.fptrunc %50 : f32 to f16 + %52 = llvm.insertvalue %30, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %53 = llvm.insertvalue %34, %52[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %54 = llvm.insertvalue %17, %53[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %55 = llvm.insertvalue %34, %54[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %56 = llvm.insertvalue %38, %55[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %57 = llvm.insertvalue %34, %56[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %58 = llvm.getelementptr %arg12[%30] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %59 = llvm.getelementptr %58[%48] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %51, %59 : f16, !llvm.ptr + %60 = llvm.add %30, %29 : i64 + llvm.br ^bb1(%60 : i64) + ^bb3: // pred: ^bb1 + llvm.return + } + llvm.mlir.global internal @__wg_Unknown58_kernel_0() {addr_space = 3 : i32} : !llvm.array<64 x f16> + llvm.mlir.global internal @__wg_Unknown58_kernel_1() {addr_space = 3 : i32} : !llvm.array<32 x f16> + llvm.mlir.global internal @__wg_Unknown58_kernel_2() {addr_space = 3 : i32} : !llvm.array<16 x f16> + llvm.mlir.global internal @__wg_Unknown58_kernel_3() {addr_space = 3 : i32} : !llvm.array<8 x f16> + llvm.mlir.global internal @__wg_Unknown58_kernel_4() {addr_space = 3 : i32} : !llvm.array<4 x f16> + llvm.mlir.global internal @__wg_Unknown58_kernel_5() {addr_space = 3 : i32} : !llvm.array<2 x f16> + llvm.func @Unknown58_kernel(%arg0: !llvm.ptr {llvm.noalias}, %arg1: !llvm.ptr {llvm.noalias}, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: !llvm.ptr {llvm.noalias}, %arg8: !llvm.ptr {llvm.noalias}, %arg9: i64, %arg10: i64, %arg11: i64) attributes {gpu.kernel, gpu.known_block_size = array, gpu.known_grid_size = array, nvvm.kernel} { + %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %5 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %6 = llvm.insertvalue %arg7, %5[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %7 = llvm.insertvalue %arg8, %6[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %8 = llvm.mlir.addressof @__wg_Unknown58_kernel_0 : !llvm.ptr<3> + %9 = llvm.getelementptr %8[0, 0] : (!llvm.ptr<3>) -> !llvm.ptr<3>, !llvm.array<64 x f16> + %10 = llvm.mlir.undef : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + %11 = llvm.insertvalue %9, %10[0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + %12 = llvm.insertvalue %9, %11[1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + %13 = llvm.mlir.constant(0 : index) : i64 + %14 = 
llvm.mlir.constant(64 : index) : i64 + %15 = llvm.mlir.constant(1 : index) : i64 + %16 = llvm.mlir.addressof @__wg_Unknown58_kernel_1 : !llvm.ptr<3> + %17 = llvm.getelementptr %16[0, 0] : (!llvm.ptr<3>) -> !llvm.ptr<3>, !llvm.array<32 x f16> + %18 = llvm.insertvalue %17, %10[0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + %19 = llvm.insertvalue %17, %18[1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + %20 = llvm.mlir.constant(32 : index) : i64 + %21 = llvm.mlir.addressof @__wg_Unknown58_kernel_2 : !llvm.ptr<3> + %22 = llvm.getelementptr %21[0, 0] : (!llvm.ptr<3>) -> !llvm.ptr<3>, !llvm.array<16 x f16> + %23 = llvm.insertvalue %22, %10[0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + %24 = llvm.insertvalue %22, %23[1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + %25 = llvm.mlir.constant(16 : index) : i64 + %26 = llvm.mlir.addressof @__wg_Unknown58_kernel_3 : !llvm.ptr<3> + %27 = llvm.getelementptr %26[0, 0] : (!llvm.ptr<3>) -> !llvm.ptr<3>, !llvm.array<8 x f16> + %28 = llvm.insertvalue %27, %10[0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + %29 = llvm.insertvalue %27, %28[1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + %30 = llvm.mlir.constant(8 : index) : i64 + %31 = llvm.mlir.addressof @__wg_Unknown58_kernel_4 : !llvm.ptr<3> + %32 = llvm.getelementptr %31[0, 0] : (!llvm.ptr<3>) -> !llvm.ptr<3>, !llvm.array<4 x f16> + %33 = llvm.insertvalue %32, %10[0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + %34 = llvm.insertvalue %32, %33[1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + %35 = llvm.mlir.constant(4 : index) : i64 + %36 = llvm.mlir.addressof @__wg_Unknown58_kernel_5 : !llvm.ptr<3> + %37 = llvm.getelementptr %36[0, 0] : (!llvm.ptr<3>) -> !llvm.ptr<3>, !llvm.array<2 x f16> + %38 = llvm.insertvalue %37, %10[0] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + %39 = llvm.insertvalue %37, %38[1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> + %40 = llvm.mlir.constant(2 : index) : i64 + %41 = llvm.mlir.constant(0.000000e+00 : f16) : f16 + %42 = llvm.mlir.constant(49 : index) : i64 + %43 = nvvm.read.ptx.sreg.ctaid.x : i32 + %44 = llvm.sext %43 : i32 to i64 + %45 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64)> + %46 = llvm.mul %44, %42 : i64 + %47 = nvvm.read.ptx.sreg.tid.x : i32 + %48 = llvm.sext %47 : i32 to i64 + %49 = llvm.srem %48, %14 : i64 + %50 = llvm.icmp "slt" %49, %13 : i64 + %51 = llvm.add %49, %14 : i64 + %52 = llvm.select %50, %51, %49 : i1, i64 + %53 = llvm.icmp "slt" %52, %42 : i64 + %54 = llvm.select %53, %52, %42 : i1, i64 + %55 = llvm.add %52, %15 : i64 + %56 = llvm.icmp "slt" %55, %42 : i64 + %57 = llvm.select %56, %55, %42 : i1, i64 + %58 = llvm.sub %57, %54 : i64 + %59 = llvm.add %46, %54 : i64 + %60 = llvm.insertvalue %59, %2[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %61 = llvm.insertvalue %15, %60[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %62 = llvm.icmp "ugt" %58, %13 : i64 + llvm.cond_br %62, ^bb1, ^bb2(%41 : f16) + ^bb1: // pred: ^bb0 + %63 = llvm.getelementptr %arg1[%59] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %64 = llvm.mul %58, %13 : i64 + %65 = llvm.add %64, %13 : i64 + %66 = llvm.getelementptr %63[%65] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + %67 = llvm.load %66 : !llvm.ptr -> f16 + llvm.br ^bb2(%67 : f16) + ^bb2(%68: f16): // 2 preds: ^bb0, ^bb1 + %69 
= llvm.fadd %68, %41 : f16 + %70 = llvm.mlir.undef : !llvm.struct<(ptr<3>, ptr<3>, i64)> + %71 = llvm.getelementptr %9[%48] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f16 + llvm.store %69, %71 : f16, !llvm.ptr<3> + nvvm.barrier0 + %72 = llvm.icmp "ult" %48, %20 : i64 + llvm.cond_br %72, ^bb3, ^bb4 + ^bb3: // pred: ^bb2 + %73 = llvm.mul %48, %40 : i64 + %74 = llvm.getelementptr %9[%73] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f16 + %75 = llvm.load %74 : !llvm.ptr<3> -> f16 + %76 = llvm.fadd %75, %41 : f16 + %77 = llvm.add %73, %15 : i64 + %78 = llvm.getelementptr %9[%77] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f16 + %79 = llvm.load %78 : !llvm.ptr<3> -> f16 + %80 = llvm.fadd %79, %76 : f16 + %81 = llvm.getelementptr %17[%48] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f16 + llvm.store %80, %81 : f16, !llvm.ptr<3> + llvm.br ^bb4 + ^bb4: // 2 preds: ^bb2, ^bb3 + nvvm.barrier0 + %82 = llvm.icmp "ult" %48, %25 : i64 + llvm.cond_br %82, ^bb5, ^bb6 + ^bb5: // pred: ^bb4 + %83 = llvm.mul %48, %40 : i64 + %84 = llvm.getelementptr %17[%83] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f16 + %85 = llvm.load %84 : !llvm.ptr<3> -> f16 + %86 = llvm.fadd %85, %41 : f16 + %87 = llvm.add %83, %15 : i64 + %88 = llvm.getelementptr %17[%87] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f16 + %89 = llvm.load %88 : !llvm.ptr<3> -> f16 + %90 = llvm.fadd %89, %86 : f16 + %91 = llvm.getelementptr %22[%48] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f16 + llvm.store %90, %91 : f16, !llvm.ptr<3> + llvm.br ^bb6 + ^bb6: // 2 preds: ^bb4, ^bb5 + nvvm.barrier0 + %92 = llvm.icmp "ult" %48, %30 : i64 + llvm.cond_br %92, ^bb7, ^bb8 + ^bb7: // pred: ^bb6 + %93 = llvm.mul %48, %40 : i64 + %94 = llvm.getelementptr %22[%93] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f16 + %95 = llvm.load %94 : !llvm.ptr<3> -> f16 + %96 = llvm.fadd %95, %41 : f16 + %97 = llvm.add %93, %15 : i64 + %98 = llvm.getelementptr %22[%97] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f16 + %99 = llvm.load %98 : !llvm.ptr<3> -> f16 + %100 = llvm.fadd %99, %96 : f16 + %101 = llvm.getelementptr %27[%48] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f16 + llvm.store %100, %101 : f16, !llvm.ptr<3> + llvm.br ^bb8 + ^bb8: // 2 preds: ^bb6, ^bb7 + nvvm.barrier0 + %102 = llvm.icmp "ult" %48, %35 : i64 + llvm.cond_br %102, ^bb9, ^bb10 + ^bb9: // pred: ^bb8 + %103 = llvm.mul %48, %40 : i64 + %104 = llvm.getelementptr %27[%103] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f16 + %105 = llvm.load %104 : !llvm.ptr<3> -> f16 + %106 = llvm.fadd %105, %41 : f16 + %107 = llvm.add %103, %15 : i64 + %108 = llvm.getelementptr %27[%107] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f16 + %109 = llvm.load %108 : !llvm.ptr<3> -> f16 + %110 = llvm.fadd %109, %106 : f16 + %111 = llvm.getelementptr %32[%48] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f16 + llvm.store %110, %111 : f16, !llvm.ptr<3> + llvm.br ^bb10 + ^bb10: // 2 preds: ^bb8, ^bb9 + nvvm.barrier0 + %112 = llvm.icmp "ult" %48, %40 : i64 + llvm.cond_br %112, ^bb11, ^bb12 + ^bb11: // pred: ^bb10 + %113 = llvm.mul %48, %40 : i64 + %114 = llvm.getelementptr %32[%113] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f16 + %115 = llvm.load %114 : !llvm.ptr<3> -> f16 + %116 = llvm.fadd %115, %41 : f16 + %117 = llvm.add %113, %15 : i64 + %118 = llvm.getelementptr %32[%117] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f16 + %119 = llvm.load %118 : !llvm.ptr<3> -> f16 + %120 = llvm.fadd %119, %116 : f16 + %121 = llvm.getelementptr %37[%48] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f16 + llvm.store %120, %121 : f16, !llvm.ptr<3> + llvm.br ^bb12 + ^bb12: // 2 preds: ^bb10, ^bb11 + nvvm.barrier0 + %122 = llvm.icmp 
"ult" %48, %15 : i64 + llvm.cond_br %122, ^bb13, ^bb14 + ^bb13: // pred: ^bb12 + %123 = llvm.mul %48, %40 : i64 + %124 = llvm.getelementptr %37[%123] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f16 + %125 = llvm.load %124 : !llvm.ptr<3> -> f16 + %126 = llvm.fadd %125, %41 : f16 + %127 = llvm.add %123, %15 : i64 + %128 = llvm.getelementptr %37[%127] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f16 + %129 = llvm.load %128 : !llvm.ptr<3> -> f16 + %130 = llvm.fadd %129, %126 : f16 + %131 = llvm.getelementptr %arg8[%44] : (!llvm.ptr, i64) -> !llvm.ptr, f16 + llvm.store %130, %131 : f16, !llvm.ptr + llvm.br ^bb14 + ^bb14: // 2 preds: ^bb12, ^bb13 + nvvm.barrier0 llvm.return } } diff --git a/compiler/test/E2E/ResNet18/FW/2_linalg_tensor_opt.mlir b/compiler/test/E2E/ResNet18/FW/2_linalg_tensor_opt.mlir index 04d163e36..d4507bd46 100644 --- a/compiler/test/E2E/ResNet18/FW/2_linalg_tensor_opt.mlir +++ b/compiler/test/E2E/ResNet18/FW/2_linalg_tensor_opt.mlir @@ -37,53 +37,12 @@ module { %1 = mhlo.maximum %arg0, %0 : tensor<1x64x56x56xf16> return %1 : tensor<1x64x56x56xf16> } - func.func private @Unknown7(%arg0: tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> - return %0 : tensor<64x64x3x3xf16> - } - func.func private @BatchNormTrainingOp8(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<64xf32>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) - %1 = mhlo.convert %output : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf16> - return %1, %batch_mean, %batch_var : tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> - } func.func private @Unknown9(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.000000e+00> : tensor<1x64x56x56xf16> %1 = mhlo.add %arg0, %arg1 : tensor<1x64x56x56xf16> %2 = mhlo.maximum %1, %0 : tensor<1x64x56x56xf16> return %2 : tensor<1x64x56x56xf16> } - func.func private @Unknown10(%arg0: tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> - return %0 : tensor<64x64x3x3xf16> - } - func.func private @BatchNormTrainingOp11(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<64xf32>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) - %1 = mhlo.convert %output : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf16> - return %1, %batch_mean, %batch_var : tensor<1x64x56x56xf16>, 
tensor<64xf32>, tensor<64xf32> - } - func.func private @Unknown12(%arg0: tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<1x64x56x56xf16> - %1 = mhlo.maximum %arg0, %0 : tensor<1x64x56x56xf16> - return %1 : tensor<1x64x56x56xf16> - } - func.func private @Unknown13(%arg0: tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> - return %0 : tensor<64x64x3x3xf16> - } - func.func private @BatchNormTrainingOp14(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<64xf32>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) - %1 = mhlo.convert %output : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf16> - return %1, %batch_mean, %batch_var : tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> - } - func.func private @Unknown15(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<1x64x56x56xf16> - %1 = mhlo.add %arg0, %arg1 : tensor<1x64x56x56xf16> - %2 = mhlo.maximum %1, %0 : tensor<1x64x56x56xf16> - return %2 : tensor<1x64x56x56xf16> - } func.func private @Unknown16(%arg0: tensor<128x64x1x1xf32>) -> tensor<128x64x1x1xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 : (tensor<128x64x1x1xf32>) -> tensor<128x64x1x1xf16> return %0 : tensor<128x64x1x1xf16> @@ -98,12 +57,6 @@ module { %0 = mhlo.convert %arg0 : (tensor<128x64x3x3xf32>) -> tensor<128x64x3x3xf16> return %0 : tensor<128x64x3x3xf16> } - func.func private @BatchNormTrainingOp19(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %1 = mhlo.convert %output : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf16> - return %1, %batch_mean, %batch_var : tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - } func.func private @Unknown20(%arg0: tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.000000e+00> : tensor<1x128x28x28xf16> %1 = mhlo.maximum %arg0, %0 : tensor<1x128x28x28xf16> @@ -113,49 +66,12 @@ module { %0 = mhlo.convert %arg0 : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> return %0 : tensor<128x128x3x3xf16> } - func.func private @BatchNormTrainingOp22(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> 
(tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %1 = mhlo.convert %output : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf16> - return %1, %batch_mean, %batch_var : tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - } func.func private @Unknown23(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.000000e+00> : tensor<1x128x28x28xf16> %1 = mhlo.add %arg0, %arg1 : tensor<1x128x28x28xf16> %2 = mhlo.maximum %1, %0 : tensor<1x128x28x28xf16> return %2 : tensor<1x128x28x28xf16> } - func.func private @Unknown24(%arg0: tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> - return %0 : tensor<128x128x3x3xf16> - } - func.func private @BatchNormTrainingOp25(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %1 = mhlo.convert %output : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf16> - return %1, %batch_mean, %batch_var : tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - } - func.func private @Unknown26(%arg0: tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<1x128x28x28xf16> - %1 = mhlo.maximum %arg0, %0 : tensor<1x128x28x28xf16> - return %1 : tensor<1x128x28x28xf16> - } - func.func private @Unknown27(%arg0: tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> - return %0 : tensor<128x128x3x3xf16> - } - func.func private @BatchNormTrainingOp28(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %1 = mhlo.convert %output : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf16> - return %1, %batch_mean, %batch_var : 
tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - } - func.func private @Unknown29(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<1x128x28x28xf16> - %1 = mhlo.add %arg0, %arg1 : tensor<1x128x28x28xf16> - %2 = mhlo.maximum %1, %0 : tensor<1x128x28x28xf16> - return %2 : tensor<1x128x28x28xf16> - } func.func private @Unknown30(%arg0: tensor<256x128x1x1xf32>) -> tensor<256x128x1x1xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 : (tensor<256x128x1x1xf32>) -> tensor<256x128x1x1xf16> return %0 : tensor<256x128x1x1xf16> @@ -170,12 +86,6 @@ module { %0 = mhlo.convert %arg0 : (tensor<256x128x3x3xf32>) -> tensor<256x128x3x3xf16> return %0 : tensor<256x128x3x3xf16> } - func.func private @BatchNormTrainingOp33(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %1 = mhlo.convert %output : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf16> - return %1, %batch_mean, %batch_var : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - } func.func private @Unknown34(%arg0: tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.000000e+00> : tensor<1x256x14x14xf16> %1 = mhlo.maximum %arg0, %0 : tensor<1x256x14x14xf16> @@ -185,49 +95,12 @@ module { %0 = mhlo.convert %arg0 : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> return %0 : tensor<256x256x3x3xf16> } - func.func private @BatchNormTrainingOp36(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %1 = mhlo.convert %output : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf16> - return %1, %batch_mean, %batch_var : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - } func.func private @Unknown37(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.000000e+00> : tensor<1x256x14x14xf16> %1 = mhlo.add %arg0, %arg1 : tensor<1x256x14x14xf16> %2 = mhlo.maximum %1, %0 : tensor<1x256x14x14xf16> return %2 : tensor<1x256x14x14xf16> } - func.func private @Unknown38(%arg0: tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> - return %0 : tensor<256x256x3x3xf16> - 
} - func.func private @BatchNormTrainingOp39(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %1 = mhlo.convert %output : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf16> - return %1, %batch_mean, %batch_var : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - } - func.func private @Unknown40(%arg0: tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<1x256x14x14xf16> - %1 = mhlo.maximum %arg0, %0 : tensor<1x256x14x14xf16> - return %1 : tensor<1x256x14x14xf16> - } - func.func private @Unknown41(%arg0: tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> - return %0 : tensor<256x256x3x3xf16> - } - func.func private @BatchNormTrainingOp42(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %1 = mhlo.convert %output : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf16> - return %1, %batch_mean, %batch_var : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - } - func.func private @Unknown43(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<1x256x14x14xf16> - %1 = mhlo.add %arg0, %arg1 : tensor<1x256x14x14xf16> - %2 = mhlo.maximum %1, %0 : tensor<1x256x14x14xf16> - return %2 : tensor<1x256x14x14xf16> - } func.func private @Unknown44(%arg0: tensor<512x256x1x1xf32>) -> tensor<512x256x1x1xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 : (tensor<512x256x1x1xf32>) -> tensor<512x256x1x1xf16> return %0 : tensor<512x256x1x1xf16> @@ -242,12 +115,6 @@ module { %0 = mhlo.convert %arg0 : (tensor<512x256x3x3xf32>) -> tensor<512x256x3x3xf16> return %0 : tensor<512x256x3x3xf16> } - func.func private @BatchNormTrainingOp47(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} 
: (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %1 = mhlo.convert %output : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf16> - return %1, %batch_mean, %batch_var : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - } func.func private @Unknown48(%arg0: tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.000000e+00> : tensor<1x512x7x7xf16> %1 = mhlo.maximum %arg0, %0 : tensor<1x512x7x7xf16> @@ -257,72 +124,36 @@ module { %0 = mhlo.convert %arg0 : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> return %0 : tensor<512x512x3x3xf16> } - func.func private @BatchNormTrainingOp50(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %1 = mhlo.convert %output : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf16> - return %1, %batch_mean, %batch_var : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - } func.func private @Unknown51(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.000000e+00> : tensor<1x512x7x7xf16> %1 = mhlo.add %arg0, %arg1 : tensor<1x512x7x7xf16> %2 = mhlo.maximum %1, %0 : tensor<1x512x7x7xf16> return %2 : tensor<1x512x7x7xf16> } - func.func private @Unknown52(%arg0: tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> - return %0 : tensor<512x512x3x3xf16> - } - func.func private @BatchNormTrainingOp53(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %1 = mhlo.convert %output : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf16> - return %1, %batch_mean, %batch_var : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - } - func.func private @Unknown54(%arg0: tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<1x512x7x7xf16> - %1 = mhlo.maximum %arg0, %0 : tensor<1x512x7x7xf16> - return %1 : tensor<1x512x7x7xf16> - } - func.func private @Unknown55(%arg0: tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> - return %0 : tensor<512x512x3x3xf16> - } - func.func private 
@BatchNormTrainingOp56(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %1 = mhlo.convert %output : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf16> - return %1, %batch_mean, %batch_var : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - } - func.func private @Unknown57(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<1x512x7x7xf16> - %1 = mhlo.add %arg0, %arg1 : tensor<1x512x7x7xf16> - %2 = mhlo.maximum %1, %0 : tensor<1x512x7x7xf16> - return %2 : tensor<1x512x7x7xf16> + func.func private @Unknown58(%arg0: tensor<1x512x7x7xf16>) -> tensor<1x512xf16> attributes {__byteir_reduction_fusion__} { + %0 = mhlo.constant dense<0.000000e+00> : tensor + %1 = mhlo.reduce(%arg0 init: %0) across dimensions = [3, 2] : (tensor<1x512x7x7xf16>, tensor) -> tensor<1x512xf16> + reducer(%arg1: tensor, %arg2: tensor) { + %2 = mhlo.add %arg1, %arg2 : tensor + mhlo.return %2 : tensor + } + return %1 : tensor<1x512xf16> } - func.func private @Unknown58(%arg0: tensor<1x512xf16>) -> tensor<1x512xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown59(%arg0: tensor<1x512xf16>) -> tensor<1x512xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<2.040100e-02> : tensor<1x512xf16> %1 = mhlo.multiply %arg0, %0 : tensor<1x512xf16> return %1 : tensor<1x512xf16> } - func.func private @Unknown59(%arg0: tensor<1000x512xf32>) -> tensor<1000x512xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown60(%arg0: tensor<1000x512xf32>) -> tensor<1000x512xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 : (tensor<1000x512xf32>) -> tensor<1000x512xf16> return %0 : tensor<1000x512xf16> } - func.func private @Unknown60(%arg0: tensor<1000xf32>, %arg1: tensor<1x1000xf16>) -> tensor<1x1000xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown61(%arg0: tensor<1000xf32>, %arg1: tensor<1x1000xf16>) -> tensor<1x1000xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 : (tensor<1000xf32>) -> tensor<1000xf16> %1 = mhlo.reshape %0 : (tensor<1000xf16>) -> tensor<1x1000xf16> %2 = mhlo.add %arg1, %1 : tensor<1x1000xf16> return %2 : tensor<1x1000xf16> } - func.func private @Unknown61(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<64xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<64xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<64xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<64xf32> - %4 = mhlo.add %2, %3 : tensor<64xf32> - return %4 : tensor<64xf32> - } func.func private @Unknown62(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.899999976> : tensor<64xf32> %1 = mhlo.constant dense<1.000000e-01> : tensor<64xf32> @@ -331,78 
+162,6 @@ module { %4 = mhlo.add %2, %3 : tensor<64xf32> return %4 : tensor<64xf32> } - func.func private @Unknown63(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<64xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<64xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<64xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<64xf32> - %4 = mhlo.add %2, %3 : tensor<64xf32> - return %4 : tensor<64xf32> - } - func.func private @Unknown64(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<64xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<64xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<64xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<64xf32> - %4 = mhlo.add %2, %3 : tensor<64xf32> - return %4 : tensor<64xf32> - } - func.func private @Unknown65(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<64xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<64xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<64xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<64xf32> - %4 = mhlo.add %2, %3 : tensor<64xf32> - return %4 : tensor<64xf32> - } - func.func private @Unknown66(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<64xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<64xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<64xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<64xf32> - %4 = mhlo.add %2, %3 : tensor<64xf32> - return %4 : tensor<64xf32> - } - func.func private @Unknown67(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<64xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<64xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<64xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<64xf32> - %4 = mhlo.add %2, %3 : tensor<64xf32> - return %4 : tensor<64xf32> - } - func.func private @Unknown68(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<64xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<64xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<64xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<64xf32> - %4 = mhlo.add %2, %3 : tensor<64xf32> - return %4 : tensor<64xf32> - } - func.func private @Unknown69(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<64xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<64xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<64xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<64xf32> - %4 = mhlo.add %2, %3 : tensor<64xf32> - return %4 : tensor<64xf32> - } - func.func private @Unknown70(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<64xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<64xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<64xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<64xf32> - %4 = mhlo.add %2, %3 : tensor<64xf32> - return %4 : tensor<64xf32> - } - func.func private @Unknown71(%arg0: tensor<128xf32>, %arg1: 
tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<128xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<128xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<128xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<128xf32> - %4 = mhlo.add %2, %3 : tensor<128xf32> - return %4 : tensor<128xf32> - } func.func private @Unknown72(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.899999976> : tensor<128xf32> %1 = mhlo.constant dense<1.000000e-01> : tensor<128xf32> @@ -411,78 +170,6 @@ module { %4 = mhlo.add %2, %3 : tensor<128xf32> return %4 : tensor<128xf32> } - func.func private @Unknown73(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<128xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<128xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<128xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<128xf32> - %4 = mhlo.add %2, %3 : tensor<128xf32> - return %4 : tensor<128xf32> - } - func.func private @Unknown74(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<128xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<128xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<128xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<128xf32> - %4 = mhlo.add %2, %3 : tensor<128xf32> - return %4 : tensor<128xf32> - } - func.func private @Unknown75(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<128xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<128xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<128xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<128xf32> - %4 = mhlo.add %2, %3 : tensor<128xf32> - return %4 : tensor<128xf32> - } - func.func private @Unknown76(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<128xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<128xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<128xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<128xf32> - %4 = mhlo.add %2, %3 : tensor<128xf32> - return %4 : tensor<128xf32> - } - func.func private @Unknown77(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<128xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<128xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<128xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<128xf32> - %4 = mhlo.add %2, %3 : tensor<128xf32> - return %4 : tensor<128xf32> - } - func.func private @Unknown78(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<128xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<128xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<128xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<128xf32> - %4 = mhlo.add %2, %3 : tensor<128xf32> - return %4 : tensor<128xf32> - } - func.func private @Unknown79(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<128xf32> - %1 = mhlo.constant 
dense<1.000000e-01> : tensor<128xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<128xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<128xf32> - %4 = mhlo.add %2, %3 : tensor<128xf32> - return %4 : tensor<128xf32> - } - func.func private @Unknown80(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<128xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<128xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<128xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<128xf32> - %4 = mhlo.add %2, %3 : tensor<128xf32> - return %4 : tensor<128xf32> - } - func.func private @Unknown81(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<256xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<256xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<256xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<256xf32> - %4 = mhlo.add %2, %3 : tensor<256xf32> - return %4 : tensor<256xf32> - } func.func private @Unknown82(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.899999976> : tensor<256xf32> %1 = mhlo.constant dense<1.000000e-01> : tensor<256xf32> @@ -491,78 +178,6 @@ module { %4 = mhlo.add %2, %3 : tensor<256xf32> return %4 : tensor<256xf32> } - func.func private @Unknown83(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<256xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<256xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<256xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<256xf32> - %4 = mhlo.add %2, %3 : tensor<256xf32> - return %4 : tensor<256xf32> - } - func.func private @Unknown84(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<256xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<256xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<256xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<256xf32> - %4 = mhlo.add %2, %3 : tensor<256xf32> - return %4 : tensor<256xf32> - } - func.func private @Unknown85(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<256xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<256xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<256xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<256xf32> - %4 = mhlo.add %2, %3 : tensor<256xf32> - return %4 : tensor<256xf32> - } - func.func private @Unknown86(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<256xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<256xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<256xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<256xf32> - %4 = mhlo.add %2, %3 : tensor<256xf32> - return %4 : tensor<256xf32> - } - func.func private @Unknown87(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<256xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<256xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<256xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<256xf32> - %4 = mhlo.add %2, %3 : 
tensor<256xf32> - return %4 : tensor<256xf32> - } - func.func private @Unknown88(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<256xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<256xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<256xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<256xf32> - %4 = mhlo.add %2, %3 : tensor<256xf32> - return %4 : tensor<256xf32> - } - func.func private @Unknown89(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<256xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<256xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<256xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<256xf32> - %4 = mhlo.add %2, %3 : tensor<256xf32> - return %4 : tensor<256xf32> - } - func.func private @Unknown90(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<256xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<256xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<256xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<256xf32> - %4 = mhlo.add %2, %3 : tensor<256xf32> - return %4 : tensor<256xf32> - } - func.func private @Unknown91(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<512xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<512xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<512xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<512xf32> - %4 = mhlo.add %2, %3 : tensor<512xf32> - return %4 : tensor<512xf32> - } func.func private @Unknown92(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.899999976> : tensor<512xf32> %1 = mhlo.constant dense<1.000000e-01> : tensor<512xf32> @@ -571,206 +186,137 @@ module { %4 = mhlo.add %2, %3 : tensor<512xf32> return %4 : tensor<512xf32> } - func.func private @Unknown93(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<512xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<512xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<512xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<512xf32> - %4 = mhlo.add %2, %3 : tensor<512xf32> - return %4 : tensor<512xf32> - } - func.func private @Unknown94(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<512xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<512xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<512xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<512xf32> - %4 = mhlo.add %2, %3 : tensor<512xf32> - return %4 : tensor<512xf32> - } - func.func private @Unknown95(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<512xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<512xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<512xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<512xf32> - %4 = mhlo.add %2, %3 : tensor<512xf32> - return %4 : tensor<512xf32> - } - func.func private @Unknown96(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes 
{__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<512xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<512xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<512xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<512xf32> - %4 = mhlo.add %2, %3 : tensor<512xf32> - return %4 : tensor<512xf32> - } - func.func private @Unknown97(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<512xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<512xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<512xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<512xf32> - %4 = mhlo.add %2, %3 : tensor<512xf32> - return %4 : tensor<512xf32> - } - func.func private @Unknown98(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<512xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<512xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<512xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<512xf32> - %4 = mhlo.add %2, %3 : tensor<512xf32> - return %4 : tensor<512xf32> - } - func.func private @Unknown99(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<512xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<512xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<512xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<512xf32> - %4 = mhlo.add %2, %3 : tensor<512xf32> - return %4 : tensor<512xf32> - } - func.func private @Unknown100(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.899999976> : tensor<512xf32> - %1 = mhlo.constant dense<1.000000e-01> : tensor<512xf32> - %2 = mhlo.multiply %arg0, %1 : tensor<512xf32> - %3 = mhlo.multiply %arg1, %0 : tensor<512xf32> - %4 = mhlo.add %2, %3 : tensor<512xf32> - return %4 : tensor<512xf32> - } func.func @main(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>, %arg2: tensor<64x3x7x7xf32>, %arg3: tensor<1000xf32>, %arg4: tensor<1000x512xf32>, %arg5: tensor<64xf32>, %arg6: tensor<64xf32>, %arg7: tensor<64xf32>, %arg8: tensor<64xf32>, %arg9: tensor<64x64x3x3xf32>, %arg10: tensor<64x64x3x3xf32>, %arg11: tensor<64xf32>, %arg12: tensor<64xf32>, %arg13: tensor<64xf32>, %arg14: tensor<64xf32>, %arg15: tensor<64x64x3x3xf32>, %arg16: tensor<64x64x3x3xf32>, %arg17: tensor<128xf32>, %arg18: tensor<128xf32>, %arg19: tensor<128xf32>, %arg20: tensor<128xf32>, %arg21: tensor<128x64x3x3xf32>, %arg22: tensor<128x128x3x3xf32>, %arg23: tensor<128x64x1x1xf32>, %arg24: tensor<128xf32>, %arg25: tensor<128xf32>, %arg26: tensor<128xf32>, %arg27: tensor<128xf32>, %arg28: tensor<128xf32>, %arg29: tensor<128xf32>, %arg30: tensor<128x128x3x3xf32>, %arg31: tensor<128x128x3x3xf32>, %arg32: tensor<256xf32>, %arg33: tensor<256xf32>, %arg34: tensor<256xf32>, %arg35: tensor<256xf32>, %arg36: tensor<256x128x3x3xf32>, %arg37: tensor<256x256x3x3xf32>, %arg38: tensor<256x128x1x1xf32>, %arg39: tensor<256xf32>, %arg40: tensor<256xf32>, %arg41: tensor<256xf32>, %arg42: tensor<256xf32>, %arg43: tensor<256xf32>, %arg44: tensor<256xf32>, %arg45: tensor<256x256x3x3xf32>, %arg46: tensor<256x256x3x3xf32>, %arg47: tensor<512xf32>, %arg48: tensor<512xf32>, %arg49: tensor<512xf32>, %arg50: tensor<512xf32>, %arg51: tensor<512x256x3x3xf32>, %arg52: tensor<512x512x3x3xf32>, %arg53: tensor<512x256x1x1xf32>, %arg54: 
tensor<512xf32>, %arg55: tensor<512xf32>, %arg56: tensor<512xf32>, %arg57: tensor<512xf32>, %arg58: tensor<512xf32>, %arg59: tensor<512xf32>, %arg60: tensor<512x512x3x3xf32>, %arg61: tensor<512x512x3x3xf32>, %arg62: tensor, %arg63: tensor<64xf32>, %arg64: tensor<64xf32>, %arg65: tensor, %arg66: tensor<64xf32>, %arg67: tensor<64xf32>, %arg68: tensor, %arg69: tensor<64xf32>, %arg70: tensor<64xf32>, %arg71: tensor, %arg72: tensor<64xf32>, %arg73: tensor<64xf32>, %arg74: tensor, %arg75: tensor<64xf32>, %arg76: tensor<64xf32>, %arg77: tensor, %arg78: tensor<128xf32>, %arg79: tensor<128xf32>, %arg80: tensor, %arg81: tensor<128xf32>, %arg82: tensor<128xf32>, %arg83: tensor, %arg84: tensor<128xf32>, %arg85: tensor<128xf32>, %arg86: tensor, %arg87: tensor<128xf32>, %arg88: tensor<128xf32>, %arg89: tensor, %arg90: tensor<128xf32>, %arg91: tensor<128xf32>, %arg92: tensor, %arg93: tensor<256xf32>, %arg94: tensor<256xf32>, %arg95: tensor, %arg96: tensor<256xf32>, %arg97: tensor<256xf32>, %arg98: tensor, %arg99: tensor<256xf32>, %arg100: tensor<256xf32>, %arg101: tensor, %arg102: tensor<256xf32>, %arg103: tensor<256xf32>, %arg104: tensor, %arg105: tensor<256xf32>, %arg106: tensor<256xf32>, %arg107: tensor, %arg108: tensor<512xf32>, %arg109: tensor<512xf32>, %arg110: tensor, %arg111: tensor<512xf32>, %arg112: tensor<512xf32>, %arg113: tensor, %arg114: tensor<512xf32>, %arg115: tensor<512xf32>, %arg116: tensor, %arg117: tensor<512xf32>, %arg118: tensor<512xf32>, %arg119: tensor, %arg120: tensor<512xf32>, %arg121: tensor<512xf32>, %arg122: tensor<1x3x224x224xf32>) -> (tensor<1x1000xf16>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<64x3x7x7xf16>, tensor<1x3x224x224xf16>, tensor<1x64x112x112xf16>, tensor<1x64x112x112xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<128x64x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<128x64x1x1xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, 
tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<256x128x3x3xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, tensor<1x256x14x14xf16>, tensor<256x128x1x1xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<512x256x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<512x256x1x1xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<1x512xf16>, tensor<512x1000xf16>) { - %0 = mhlo.constant dense<0.000000e+00> : tensor - %1 = mhlo.constant dense<0xFC00> : tensor - %2 = call @Unknown0(%arg122) : (tensor<1x3x224x224xf32>) -> tensor<1x3x224x224xf16> - %3 = call @Unknown1(%arg2) : (tensor<64x3x7x7xf32>) -> tensor<64x3x7x7xf16> - %4 = mhlo.convolution(%2, %3) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[3, 3], [3, 3]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x3x224x224xf16>, tensor<64x3x7x7xf16>) -> tensor<1x64x112x112xf16> - %5:3 = call @BatchNormTrainingOp2(%4, %arg1, %arg0) : (tensor<1x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) - %6 = call @Unknown3(%5#0) : (tensor<1x64x112x112xf16>) -> tensor<1x64x112x112xf16> - %7 = "mhlo.reduce_window"(%6, %1) ({ + %0 = mhlo.constant dense<0xFC00> : tensor + %1 = call @Unknown0(%arg122) : (tensor<1x3x224x224xf32>) -> tensor<1x3x224x224xf16> + %2 = call @Unknown1(%arg2) : (tensor<64x3x7x7xf32>) -> tensor<64x3x7x7xf16> + %3 = mhlo.convolution(%1, %2) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[3, 3], [3, 3]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x3x224x224xf16>, tensor<64x3x7x7xf16>) -> tensor<1x64x112x112xf16> + %4:3 = call @BatchNormTrainingOp2(%3, %arg1, %arg0) : (tensor<1x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) + %5 = call @Unknown3(%4#0) : (tensor<1x64x112x112xf16>) -> tensor<1x64x112x112xf16> + %6 = "mhlo.reduce_window"(%5, %0) ({ ^bb0(%arg123: tensor, %arg124: tensor): - %127 = mhlo.maximum %arg123, %arg124 : tensor - mhlo.return %127 : tensor + %126 = mhlo.maximum %arg123, %arg124 : tensor + mhlo.return %126 : tensor }) {base_dilations = dense<1> : tensor<4xi64>, padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = dense<1> : tensor<4xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : (tensor<1x64x112x112xf16>, tensor) -> tensor<1x64x56x56xf16> - %8 = call @Unknown4(%arg9) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> - %9 = mhlo.convolution(%7, %8) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : 
(tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> - %10:3 = call @BatchNormTrainingOp5(%9, %arg6, %arg5) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) - %11 = call @Unknown6(%10#0) : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> - %12 = call @Unknown7(%arg10) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> - %13 = mhlo.convolution(%11, %12) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> - %14:3 = call @BatchNormTrainingOp8(%13, %arg8, %arg7) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) - %15 = call @Unknown9(%14#0, %7) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> - %16 = call @Unknown10(%arg15) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> - %17 = mhlo.convolution(%15, %16) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> - %18:3 = call @BatchNormTrainingOp11(%17, %arg12, %arg11) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) - %19 = call @Unknown12(%18#0) : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> - %20 = call @Unknown13(%arg16) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> - %21 = mhlo.convolution(%19, %20) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> - %22:3 = call @BatchNormTrainingOp14(%21, %arg14, %arg13) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) - %23 = call @Unknown15(%22#0, %15) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> - %24 = call @Unknown16(%arg23) : (tensor<128x64x1x1xf32>) -> tensor<128x64x1x1xf16> - %25 = mhlo.convolution(%23, %24) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<128x64x1x1xf16>) -> tensor<1x128x28x28xf16> - %26:3 = call @BatchNormTrainingOp17(%25, %arg25, %arg24) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %27 = call @Unknown18(%arg21) : (tensor<128x64x3x3xf32>) -> tensor<128x64x3x3xf16> - %28 = mhlo.convolution(%23, %27) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<128x64x3x3xf16>) -> tensor<1x128x28x28xf16> - 
%29:3 = call @BatchNormTrainingOp19(%28, %arg18, %arg17) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %30 = call @Unknown20(%29#0) : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> - %31 = call @Unknown21(%arg22) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> - %32 = mhlo.convolution(%30, %31) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> - %33:3 = call @BatchNormTrainingOp22(%32, %arg20, %arg19) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %34 = call @Unknown23(%33#0, %26#0) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> - %35 = call @Unknown24(%arg30) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> - %36 = mhlo.convolution(%34, %35) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> - %37:3 = call @BatchNormTrainingOp25(%36, %arg27, %arg26) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %38 = call @Unknown26(%37#0) : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> - %39 = call @Unknown27(%arg31) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> - %40 = mhlo.convolution(%38, %39) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> - %41:3 = call @BatchNormTrainingOp28(%40, %arg29, %arg28) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %42 = call @Unknown29(%41#0, %34) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> - %43 = call @Unknown30(%arg38) : (tensor<256x128x1x1xf32>) -> tensor<256x128x1x1xf16> - %44 = mhlo.convolution(%42, %43) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<256x128x1x1xf16>) -> tensor<1x256x14x14xf16> - %45:3 = call @BatchNormTrainingOp31(%44, %arg40, %arg39) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %46 = call @Unknown32(%arg36) : (tensor<256x128x3x3xf32>) -> tensor<256x128x3x3xf16> - %47 = mhlo.convolution(%42, %46) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<256x128x3x3xf16>) -> tensor<1x256x14x14xf16> 
- %48:3 = call @BatchNormTrainingOp33(%47, %arg33, %arg32) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %49 = call @Unknown34(%48#0) : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> - %50 = call @Unknown35(%arg37) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> - %51 = mhlo.convolution(%49, %50) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> - %52:3 = call @BatchNormTrainingOp36(%51, %arg35, %arg34) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %53 = call @Unknown37(%52#0, %45#0) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> - %54 = call @Unknown38(%arg45) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> - %55 = mhlo.convolution(%53, %54) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> - %56:3 = call @BatchNormTrainingOp39(%55, %arg42, %arg41) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %57 = call @Unknown40(%56#0) : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> - %58 = call @Unknown41(%arg46) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> - %59 = mhlo.convolution(%57, %58) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> - %60:3 = call @BatchNormTrainingOp42(%59, %arg44, %arg43) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %61 = call @Unknown43(%60#0, %53) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> - %62 = call @Unknown44(%arg53) : (tensor<512x256x1x1xf32>) -> tensor<512x256x1x1xf16> - %63 = mhlo.convolution(%61, %62) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<512x256x1x1xf16>) -> tensor<1x512x7x7xf16> - %64:3 = call @BatchNormTrainingOp45(%63, %arg55, %arg54) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %65 = call @Unknown46(%arg51) : (tensor<512x256x3x3xf32>) -> tensor<512x256x3x3xf16> - %66 = mhlo.convolution(%61, %65) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<512x256x3x3xf16>) -> tensor<1x512x7x7xf16> - 
%67:3 = call @BatchNormTrainingOp47(%66, %arg48, %arg47) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %68 = call @Unknown48(%67#0) : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> - %69 = call @Unknown49(%arg52) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> - %70 = mhlo.convolution(%68, %69) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> - %71:3 = call @BatchNormTrainingOp50(%70, %arg50, %arg49) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %72 = call @Unknown51(%71#0, %64#0) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> - %73 = call @Unknown52(%arg60) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> - %74 = mhlo.convolution(%72, %73) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> - %75:3 = call @BatchNormTrainingOp53(%74, %arg57, %arg56) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %76 = call @Unknown54(%75#0) : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> - %77 = call @Unknown55(%arg61) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> - %78 = mhlo.convolution(%76, %77) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> - %79:3 = call @BatchNormTrainingOp56(%78, %arg59, %arg58) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %80 = call @Unknown57(%79#0, %72) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> - %81 = mhlo.reduce(%80 init: %0) across dimensions = [3, 2] : (tensor<1x512x7x7xf16>, tensor) -> tensor<1x512xf16> - reducer(%arg123: tensor, %arg124: tensor) { - %127 = mhlo.add %arg123, %arg124 : tensor - mhlo.return %127 : tensor - } - %82 = call @Unknown58(%81) : (tensor<1x512xf16>) -> tensor<1x512xf16> - %83 = call @Unknown59(%arg4) : (tensor<1000x512xf32>) -> tensor<1000x512xf16> - %84 = "mhlo.transpose"(%83) {minor_to_major = dense<[0, 1]> : tensor<2xindex>, permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<1000x512xf16>) -> tensor<512x1000xf16> - %85 = "mhlo.dot_general"(%82, %83) {dot_dimension_numbers = #mhlo.dot, precision_config = [#mhlo, #mhlo]} : (tensor<1x512xf16>, tensor<1000x512xf16>) -> tensor<1x1000xf16> - %86 = call @Unknown60(%arg3, %85) : (tensor<1000xf32>, tensor<1x1000xf16>) -> tensor<1x1000xf16> - %87 = call @Unknown61(%5#1, %arg63) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %88 = call @Unknown62(%5#2, %arg64) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %89 = call @Unknown63(%10#1, %arg66) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %90 = call 
@Unknown64(%10#2, %arg67) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %91 = call @Unknown65(%14#1, %arg69) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %92 = call @Unknown66(%14#2, %arg70) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %93 = call @Unknown67(%18#1, %arg72) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %94 = call @Unknown68(%18#2, %arg73) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %95 = call @Unknown69(%22#1, %arg75) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %96 = call @Unknown70(%22#2, %arg76) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %97 = call @Unknown71(%29#1, %arg78) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %98 = call @Unknown72(%29#2, %arg79) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %99 = call @Unknown73(%33#1, %arg81) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %100 = call @Unknown74(%33#2, %arg82) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %101 = call @Unknown75(%26#1, %arg84) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %102 = call @Unknown76(%26#2, %arg85) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %103 = call @Unknown77(%37#1, %arg87) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %104 = call @Unknown78(%37#2, %arg88) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %105 = call @Unknown79(%41#1, %arg90) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %106 = call @Unknown80(%41#2, %arg91) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %107 = call @Unknown81(%48#1, %arg93) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %108 = call @Unknown82(%48#2, %arg94) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %109 = call @Unknown83(%52#1, %arg96) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %110 = call @Unknown84(%52#2, %arg97) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %111 = call @Unknown85(%45#1, %arg99) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %112 = call @Unknown86(%45#2, %arg100) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %113 = call @Unknown87(%56#1, %arg102) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %114 = call @Unknown88(%56#2, %arg103) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %115 = call @Unknown89(%60#1, %arg105) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %116 = call @Unknown90(%60#2, %arg106) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %117 = call @Unknown91(%67#1, %arg108) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %118 = call @Unknown92(%67#2, %arg109) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %119 = call @Unknown93(%71#1, %arg111) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %120 = call @Unknown94(%71#2, %arg112) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %121 = call @Unknown95(%64#1, %arg114) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %122 = call @Unknown96(%64#2, %arg115) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %123 = call @Unknown97(%75#1, %arg117) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %124 = call @Unknown98(%75#2, %arg118) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %125 = call @Unknown99(%79#1, %arg120) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %126 = call @Unknown100(%79#2, %arg121) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - return %86, %arg0, %arg1, 
%arg5, %arg6, %arg7, %arg8, %arg11, %arg12, %arg13, %arg14, %arg17, %arg18, %arg19, %arg20, %arg24, %arg25, %arg26, %arg27, %arg28, %arg29, %arg32, %arg33, %arg34, %arg35, %arg39, %arg40, %arg41, %arg42, %arg43, %arg44, %arg47, %arg48, %arg49, %arg50, %arg54, %arg55, %arg56, %arg57, %arg58, %arg59, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, %102, %103, %104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, %121, %122, %123, %124, %125, %126, %3, %2, %4, %6, %7, %8, %9, %11, %12, %13, %15, %16, %17, %19, %20, %21, %23, %27, %28, %30, %31, %32, %24, %25, %34, %35, %36, %38, %39, %40, %42, %46, %47, %49, %50, %51, %43, %44, %53, %54, %55, %57, %58, %59, %61, %65, %66, %68, %69, %70, %62, %63, %72, %73, %74, %76, %77, %78, %80, %82, %84 : tensor<1x1000xf16>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<64x3x7x7xf16>, tensor<1x3x224x224xf16>, tensor<1x64x112x112xf16>, tensor<1x64x112x112xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<128x64x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<128x64x1x1xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<256x128x3x3xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, tensor<1x256x14x14xf16>, tensor<256x128x1x1xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<512x256x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<512x256x1x1xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, 
tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<1x512xf16>, tensor<512x1000xf16> + %7 = call @Unknown4(%arg9) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> + %8 = mhlo.convolution(%6, %7) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> + %9:3 = call @BatchNormTrainingOp5(%8, %arg6, %arg5) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) + %10 = call @Unknown6(%9#0) : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> + %11 = call @Unknown4(%arg10) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> + %12 = mhlo.convolution(%10, %11) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> + %13:3 = call @BatchNormTrainingOp5(%12, %arg8, %arg7) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) + %14 = call @Unknown9(%13#0, %6) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> + %15 = call @Unknown4(%arg15) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> + %16 = mhlo.convolution(%14, %15) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> + %17:3 = call @BatchNormTrainingOp5(%16, %arg12, %arg11) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) + %18 = call @Unknown6(%17#0) : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> + %19 = call @Unknown4(%arg16) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> + %20 = mhlo.convolution(%18, %19) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> + %21:3 = call @BatchNormTrainingOp5(%20, %arg14, %arg13) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) + %22 = call @Unknown9(%21#0, %14) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> + %23 = call @Unknown16(%arg23) : (tensor<128x64x1x1xf32>) -> tensor<128x64x1x1xf16> + %24 = mhlo.convolution(%22, %23) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<128x64x1x1xf16>) -> tensor<1x128x28x28xf16> + %25:3 = call @BatchNormTrainingOp17(%24, %arg25, %arg24) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, 
tensor<128xf32>) + %26 = call @Unknown18(%arg21) : (tensor<128x64x3x3xf32>) -> tensor<128x64x3x3xf16> + %27 = mhlo.convolution(%22, %26) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<128x64x3x3xf16>) -> tensor<1x128x28x28xf16> + %28:3 = call @BatchNormTrainingOp17(%27, %arg18, %arg17) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %29 = call @Unknown20(%28#0) : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> + %30 = call @Unknown21(%arg22) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> + %31 = mhlo.convolution(%29, %30) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> + %32:3 = call @BatchNormTrainingOp17(%31, %arg20, %arg19) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %33 = call @Unknown23(%32#0, %25#0) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> + %34 = call @Unknown21(%arg30) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> + %35 = mhlo.convolution(%33, %34) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> + %36:3 = call @BatchNormTrainingOp17(%35, %arg27, %arg26) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %37 = call @Unknown20(%36#0) : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> + %38 = call @Unknown21(%arg31) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> + %39 = mhlo.convolution(%37, %38) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> + %40:3 = call @BatchNormTrainingOp17(%39, %arg29, %arg28) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %41 = call @Unknown23(%40#0, %33) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> + %42 = call @Unknown30(%arg38) : (tensor<256x128x1x1xf32>) -> tensor<256x128x1x1xf16> + %43 = mhlo.convolution(%41, %42) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<256x128x1x1xf16>) -> tensor<1x256x14x14xf16> + %44:3 = call @BatchNormTrainingOp31(%43, %arg40, %arg39) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, 
tensor<256xf32>) + %45 = call @Unknown32(%arg36) : (tensor<256x128x3x3xf32>) -> tensor<256x128x3x3xf16> + %46 = mhlo.convolution(%41, %45) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<256x128x3x3xf16>) -> tensor<1x256x14x14xf16> + %47:3 = call @BatchNormTrainingOp31(%46, %arg33, %arg32) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %48 = call @Unknown34(%47#0) : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> + %49 = call @Unknown35(%arg37) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> + %50 = mhlo.convolution(%48, %49) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> + %51:3 = call @BatchNormTrainingOp31(%50, %arg35, %arg34) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %52 = call @Unknown37(%51#0, %44#0) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> + %53 = call @Unknown35(%arg45) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> + %54 = mhlo.convolution(%52, %53) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> + %55:3 = call @BatchNormTrainingOp31(%54, %arg42, %arg41) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %56 = call @Unknown34(%55#0) : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> + %57 = call @Unknown35(%arg46) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> + %58 = mhlo.convolution(%56, %57) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> + %59:3 = call @BatchNormTrainingOp31(%58, %arg44, %arg43) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %60 = call @Unknown37(%59#0, %52) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> + %61 = call @Unknown44(%arg53) : (tensor<512x256x1x1xf32>) -> tensor<512x256x1x1xf16> + %62 = mhlo.convolution(%60, %61) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<512x256x1x1xf16>) -> tensor<1x512x7x7xf16> + %63:3 = call @BatchNormTrainingOp45(%62, %arg55, %arg54) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, 
tensor<512xf32>) + %64 = call @Unknown46(%arg51) : (tensor<512x256x3x3xf32>) -> tensor<512x256x3x3xf16> + %65 = mhlo.convolution(%60, %64) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<512x256x3x3xf16>) -> tensor<1x512x7x7xf16> + %66:3 = call @BatchNormTrainingOp45(%65, %arg48, %arg47) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %67 = call @Unknown48(%66#0) : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> + %68 = call @Unknown49(%arg52) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> + %69 = mhlo.convolution(%67, %68) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> + %70:3 = call @BatchNormTrainingOp45(%69, %arg50, %arg49) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %71 = call @Unknown51(%70#0, %63#0) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> + %72 = call @Unknown49(%arg60) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> + %73 = mhlo.convolution(%71, %72) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> + %74:3 = call @BatchNormTrainingOp45(%73, %arg57, %arg56) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %75 = call @Unknown48(%74#0) : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> + %76 = call @Unknown49(%arg61) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> + %77 = mhlo.convolution(%75, %76) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> + %78:3 = call @BatchNormTrainingOp45(%77, %arg59, %arg58) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %79 = call @Unknown51(%78#0, %71) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> + %80 = call @Unknown58(%79) : (tensor<1x512x7x7xf16>) -> tensor<1x512xf16> + %81 = call @Unknown59(%80) : (tensor<1x512xf16>) -> tensor<1x512xf16> + %82 = call @Unknown60(%arg4) : (tensor<1000x512xf32>) -> tensor<1000x512xf16> + %83 = "mhlo.transpose"(%82) {minor_to_major = dense<[0, 1]> : tensor<2xindex>, permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<1000x512xf16>) -> tensor<512x1000xf16> + %84 = "mhlo.dot_general"(%81, %82) {dot_dimension_numbers = #mhlo.dot, precision_config = [#mhlo, #mhlo]} : (tensor<1x512xf16>, tensor<1000x512xf16>) -> tensor<1x1000xf16> + %85 = call @Unknown61(%arg3, %84) : (tensor<1000xf32>, tensor<1x1000xf16>) -> 
tensor<1x1000xf16> + %86 = call @Unknown62(%4#1, %arg63) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %87 = call @Unknown62(%4#2, %arg64) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %88 = call @Unknown62(%9#1, %arg66) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %89 = call @Unknown62(%9#2, %arg67) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %90 = call @Unknown62(%13#1, %arg69) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %91 = call @Unknown62(%13#2, %arg70) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %92 = call @Unknown62(%17#1, %arg72) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %93 = call @Unknown62(%17#2, %arg73) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %94 = call @Unknown62(%21#1, %arg75) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %95 = call @Unknown62(%21#2, %arg76) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %96 = call @Unknown72(%28#1, %arg78) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %97 = call @Unknown72(%28#2, %arg79) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %98 = call @Unknown72(%32#1, %arg81) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %99 = call @Unknown72(%32#2, %arg82) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %100 = call @Unknown72(%25#1, %arg84) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %101 = call @Unknown72(%25#2, %arg85) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %102 = call @Unknown72(%36#1, %arg87) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %103 = call @Unknown72(%36#2, %arg88) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %104 = call @Unknown72(%40#1, %arg90) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %105 = call @Unknown72(%40#2, %arg91) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %106 = call @Unknown82(%47#1, %arg93) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %107 = call @Unknown82(%47#2, %arg94) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %108 = call @Unknown82(%51#1, %arg96) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %109 = call @Unknown82(%51#2, %arg97) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %110 = call @Unknown82(%44#1, %arg99) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %111 = call @Unknown82(%44#2, %arg100) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %112 = call @Unknown82(%55#1, %arg102) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %113 = call @Unknown82(%55#2, %arg103) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %114 = call @Unknown82(%59#1, %arg105) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %115 = call @Unknown82(%59#2, %arg106) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %116 = call @Unknown92(%66#1, %arg108) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %117 = call @Unknown92(%66#2, %arg109) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %118 = call @Unknown92(%70#1, %arg111) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %119 = call @Unknown92(%70#2, %arg112) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %120 = call @Unknown92(%63#1, %arg114) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %121 = call @Unknown92(%63#2, %arg115) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %122 = call @Unknown92(%74#1, %arg117) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %123 = call 
@Unknown92(%74#2, %arg118) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %124 = call @Unknown92(%78#1, %arg120) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %125 = call @Unknown92(%78#2, %arg121) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + return %85, %arg0, %arg1, %arg5, %arg6, %arg7, %arg8, %arg11, %arg12, %arg13, %arg14, %arg17, %arg18, %arg19, %arg20, %arg24, %arg25, %arg26, %arg27, %arg28, %arg29, %arg32, %arg33, %arg34, %arg35, %arg39, %arg40, %arg41, %arg42, %arg43, %arg44, %arg47, %arg48, %arg49, %arg50, %arg54, %arg55, %arg56, %arg57, %arg58, %arg59, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, %102, %103, %104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, %121, %122, %123, %124, %125, %2, %1, %3, %5, %6, %7, %8, %10, %11, %12, %14, %15, %16, %18, %19, %20, %22, %26, %27, %29, %30, %31, %23, %24, %33, %34, %35, %37, %38, %39, %41, %45, %46, %48, %49, %50, %42, %43, %52, %53, %54, %56, %57, %58, %60, %64, %65, %67, %68, %69, %61, %62, %71, %72, %73, %75, %76, %77, %79, %81, %83 : tensor<1x1000xf16>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<64x3x7x7xf16>, tensor<1x3x224x224xf16>, tensor<1x64x112x112xf16>, tensor<1x64x112x112xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<128x64x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<128x64x1x1xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<256x128x3x3xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, tensor<1x256x14x14xf16>, tensor<256x128x1x1xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, 
tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<512x256x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<512x256x1x1xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<1x512xf16>, tensor<512x1000xf16>
   }
 }
\ No newline at end of file
diff --git a/compiler/test/E2E/ResNet18/FW/3_byre_tensor_opt.mlir b/compiler/test/E2E/ResNet18/FW/3_byre_tensor_opt.mlir
index 7eb0e9afa..2a075b46d 100644
--- a/compiler/test/E2E/ResNet18/FW/3_byre_tensor_opt.mlir
+++ b/compiler/test/E2E/ResNet18/FW/3_byre_tensor_opt.mlir
@@ -2,26 +2,65 @@
 // CHECK-LABEL: func.func @main
-#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
-#map1 = affine_map<(d0, d1) -> (d0, d1)>
-#map2 = affine_map<(d0) -> (d0)>
+#map = affine_map<() -> ()>
+#map1 = affine_map<(d0) -> (d0 mod 64, 49)>
+#map2 = affine_map<(d0) -> (d0 mod 64 + 1, 49)>
+#map3 = affine_map<(d0, d1) -> (d0 - d1)>
+#map4 = affine_map<(d0) -> (d0 * 2)>
+#map5 = affine_map<(d0) -> (d0 * 2 + 1)>
 module {
   func.func private @Unknown0(%arg0: tensor<1x3x224x224xf32>) -> tensor<1x3x224x224xf16> attributes {__byteir_elementwise_fusion__} {
+    %c224 = arith.constant 224 : index
+    %c3 = arith.constant 3 : index
+    %c1 = arith.constant 1 : index
+    %c0 = arith.constant 0 : index
     %0 = tensor.empty() : tensor<1x3x224x224xf16>
-    %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<1x3x224x224xf32>) outs(%0 : tensor<1x3x224x224xf16>) {
-    ^bb0(%in: f32, %out: f16):
-      %2 = arith.truncf %in : f32 to f16
-      linalg.yield %2 : f16
-    } -> tensor<1x3x224x224xf16>
+    %1 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %0) -> (tensor<1x3x224x224xf16>) {
+      %2 = scf.for %arg3 = %c0 to %c224 step %c1 iter_args(%arg4 = %arg2) -> (tensor<1x3x224x224xf16>) {
+        %3 = scf.for %arg5 = %c0 to %c224 step %c1 iter_args(%arg6 = %arg4) -> (tensor<1x3x224x224xf16>) {
+          %extracted_slice = tensor.extract_slice %arg0[0, %arg1, %arg3, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x3x224x224xf32> to tensor<f32>
+          %4 = tensor.empty() : tensor<f16>
+          %5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor<f32>) outs(%4 : tensor<f16>) {
+          ^bb0(%in: f32, %out: f16):
+            %6 = arith.truncf %in : f32 to f16
+            linalg.yield %6 : f16
+          } -> tensor<f16>
+          %inserted_slice = tensor.insert_slice %5 into %arg6[0, %arg1, %arg3, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<f16> into tensor<1x3x224x224xf16>
+          scf.yield %inserted_slice : tensor<1x3x224x224xf16>
+        }
+        scf.yield %3 : tensor<1x3x224x224xf16>
+      }
+      scf.yield %2 : tensor<1x3x224x224xf16>
+    }
     return %1 : tensor<1x3x224x224xf16>
   }
   func.func private @Unknown1(%arg0: tensor<64x3x7x7xf32>) -> tensor<64x3x7x7xf16> attributes {__byteir_elementwise_fusion__} {
+    %c7 = arith.constant 7 : index
+    %c3 = arith.constant 3 : index
+    %c1 = arith.constant 1 : index
+    %c64 = arith.constant 64 : index
+    %c0 = arith.constant 0 : index
     %0 = tensor.empty() : tensor<64x3x7x7xf16>
-    %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x3x7x7xf32>) outs(%0 : tensor<64x3x7x7xf16>) {
-    ^bb0(%in: f32, %out: f16):
-      %2 = arith.truncf %in : f32 to f16
-      linalg.yield %2 : f16
-    } -> tensor<64x3x7x7xf16>
+    %1 = scf.for %arg1 = %c0 to %c64 step %c1 iter_args(%arg2 = %0) -> (tensor<64x3x7x7xf16>) {
+      %2
= scf.for %arg3 = %c0 to %c3 step %c1 iter_args(%arg4 = %arg2) -> (tensor<64x3x7x7xf16>) { + %3 = scf.for %arg5 = %c0 to %c7 step %c1 iter_args(%arg6 = %arg4) -> (tensor<64x3x7x7xf16>) { + %4 = scf.for %arg7 = %c0 to %c7 step %c1 iter_args(%arg8 = %arg6) -> (tensor<64x3x7x7xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<64x3x7x7xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<64x3x7x7xf16> + scf.yield %inserted_slice : tensor<64x3x7x7xf16> + } + scf.yield %4 : tensor<64x3x7x7xf16> + } + scf.yield %3 : tensor<64x3x7x7xf16> + } + scf.yield %2 : tensor<64x3x7x7xf16> + } return %1 : tensor<64x3x7x7xf16> } func.func private @BatchNormTrainingOp2(%arg0: tensor<1x64x112x112xf16>, %arg1: tensor<64xf32>, %arg2: tensor<64xf32>) -> (tensor<1x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { @@ -31,22 +70,57 @@ module { return %1, %batch_mean, %batch_var : tensor<1x64x112x112xf16>, tensor<64xf32>, tensor<64xf32> } func.func private @Unknown3(%arg0: tensor<1x64x112x112xf16>) -> tensor<1x64x112x112xf16> attributes {__byteir_elementwise_fusion__} { + %c112 = arith.constant 112 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x64x112x112xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<1x64x112x112xf16>) outs(%0 : tensor<1x64x112x112xf16>) { - ^bb0(%in: f16, %out: f16): - %2 = arith.maxnumf %in, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<1x64x112x112xf16> + %1 = scf.for %arg1 = %c0 to %c64 step %c1 iter_args(%arg2 = %0) -> (tensor<1x64x112x112xf16>) { + %2 = scf.for %arg3 = %c0 to %c112 step %c1 iter_args(%arg4 = %arg2) -> (tensor<1x64x112x112xf16>) { + %3 = scf.for %arg5 = %c0 to %c112 step %c1 iter_args(%arg6 = %arg4) -> (tensor<1x64x112x112xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg1, %arg3, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x112x112xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %out: f16): + %6 = arith.maximumf %in, %cst : f16 + linalg.yield %6 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg6[0, %arg1, %arg3, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x64x112x112xf16> + scf.yield %inserted_slice : tensor<1x64x112x112xf16> + } + scf.yield %3 : tensor<1x64x112x112xf16> + } + scf.yield %2 : tensor<1x64x112x112xf16> + } return %1 : tensor<1x64x112x112xf16> } func.func private @Unknown4(%arg0: tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<64x64x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types 
= ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf32>) outs(%0 : tensor<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<64x64x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c64 step %c1 iter_args(%arg2 = %0) -> (tensor<64x64x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %arg2) -> (tensor<64x64x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<64x64x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<64x64x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<64x64x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<64x64x3x3xf16> + scf.yield %inserted_slice : tensor<64x64x3x3xf16> + } + scf.yield %4 : tensor<64x64x3x3xf16> + } + scf.yield %3 : tensor<64x64x3x3xf16> + } + scf.yield %2 : tensor<64x64x3x3xf16> + } return %1 : tensor<64x64x3x3xf16> } func.func private @BatchNormTrainingOp5(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<64xf32>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { @@ -56,99 +130,79 @@ module { return %1, %batch_mean, %batch_var : tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> } func.func private @Unknown6(%arg0: tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + %c56 = arith.constant 56 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<1x64x56x56xf16>) outs(%0 : tensor<1x64x56x56xf16>) { - ^bb0(%in: f16, %out: f16): - %2 = arith.maxnumf %in, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<1x64x56x56xf16> + %1 = scf.for %arg1 = %c0 to %c64 step %c1 iter_args(%arg2 = %0) -> (tensor<1x64x56x56xf16>) { + %2 = scf.for %arg3 = %c0 to %c56 step %c1 iter_args(%arg4 = %arg2) -> (tensor<1x64x56x56xf16>) { + %3 = scf.for %arg5 = %c0 to %c56 step %c1 iter_args(%arg6 = %arg4) -> (tensor<1x64x56x56xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg1, %arg3, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x56x56xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %out: f16): + %6 = arith.maximumf %in, %cst : f16 + linalg.yield %6 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg6[0, %arg1, %arg3, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x64x56x56xf16> + scf.yield %inserted_slice : tensor<1x64x56x56xf16> + } + scf.yield %3 : tensor<1x64x56x56xf16> + } + scf.yield %2 : tensor<1x64x56x56xf16> + } return %1 : tensor<1x64x56x56xf16> } - func.func private @Unknown7(%arg0: tensor<64x64x3x3xf32>) 
-> tensor<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<64x64x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf32>) outs(%0 : tensor<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<64x64x3x3xf16> - return %1 : tensor<64x64x3x3xf16> - } - func.func private @BatchNormTrainingOp8(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<64xf32>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) - %1 = mhlo.convert %output : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf16> - return %1, %batch_mean, %batch_var : tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> - } func.func private @Unknown9(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + %c56 = arith.constant 56 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) outs(%0 : tensor<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.addf %in, %in_0 : f16 - %3 = arith.maxnumf %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x64x56x56xf16> - return %1 : tensor<1x64x56x56xf16> - } - func.func private @Unknown10(%arg0: tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<64x64x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf32>) outs(%0 : tensor<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<64x64x3x3xf16> - return %1 : tensor<64x64x3x3xf16> - } - func.func private @BatchNormTrainingOp11(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<64xf32>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) - %1 = mhlo.convert %output : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf16> - return %1, %batch_mean, %batch_var : tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> - } - func.func private @Unknown12(%arg0: tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> 
attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<1x64x56x56xf16>) outs(%0 : tensor<1x64x56x56xf16>) { - ^bb0(%in: f16, %out: f16): - %2 = arith.maxnumf %in, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<1x64x56x56xf16> - return %1 : tensor<1x64x56x56xf16> - } - func.func private @Unknown13(%arg0: tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<64x64x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf32>) outs(%0 : tensor<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<64x64x3x3xf16> - return %1 : tensor<64x64x3x3xf16> - } - func.func private @BatchNormTrainingOp14(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<64xf32>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) - %1 = mhlo.convert %output : (tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf16> - return %1, %batch_mean, %batch_var : tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> - } - func.func private @Unknown15(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) outs(%0 : tensor<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.addf %in, %in_0 : f16 - %3 = arith.maxnumf %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x64x56x56xf16> + %1 = scf.for %arg2 = %c0 to %c64 step %c1 iter_args(%arg3 = %0) -> (tensor<1x64x56x56xf16>) { + %2 = scf.for %arg4 = %c0 to %c56 step %c1 iter_args(%arg5 = %arg3) -> (tensor<1x64x56x56xf16>) { + %3 = scf.for %arg6 = %c0 to %c56 step %c1 iter_args(%arg7 = %arg5) -> (tensor<1x64x56x56xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x56x56xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x56x56xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + %6 = arith.addf %in, %in_1 : f16 + %7 = arith.maximumf %6, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg7[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x64x56x56xf16> + scf.yield 
%inserted_slice : tensor<1x64x56x56xf16> + } + scf.yield %3 : tensor<1x64x56x56xf16> + } + scf.yield %2 : tensor<1x64x56x56xf16> + } return %1 : tensor<1x64x56x56xf16> } func.func private @Unknown16(%arg0: tensor<128x64x1x1xf32>) -> tensor<128x64x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<128x64x1x1xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x64x1x1xf32>) outs(%0 : tensor<128x64x1x1xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<128x64x1x1xf16> + %1 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<128x64x1x1xf16>) { + %2 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %arg2) -> (tensor<128x64x1x1xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<128x64x1x1xf32> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f32, %out: f16): + %5 = arith.truncf %in : f32 to f16 + linalg.yield %5 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<128x64x1x1xf16> + scf.yield %inserted_slice : tensor<128x64x1x1xf16> + } + scf.yield %2 : tensor<128x64x1x1xf16> + } return %1 : tensor<128x64x1x1xf16> } func.func private @BatchNormTrainingOp17(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { @@ -158,114 +212,136 @@ module { return %1, %batch_mean, %batch_var : tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> } func.func private @Unknown18(%arg0: tensor<128x64x3x3xf32>) -> tensor<128x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<128x64x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x64x3x3xf32>) outs(%0 : tensor<128x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<128x64x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<128x64x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %arg2) -> (tensor<128x64x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<128x64x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<128x64x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<128x64x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + 
%inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<128x64x3x3xf16> + scf.yield %inserted_slice : tensor<128x64x3x3xf16> + } + scf.yield %4 : tensor<128x64x3x3xf16> + } + scf.yield %3 : tensor<128x64x3x3xf16> + } + scf.yield %2 : tensor<128x64x3x3xf16> + } return %1 : tensor<128x64x3x3xf16> } - func.func private @BatchNormTrainingOp19(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %1 = mhlo.convert %output : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf16> - return %1, %batch_mean, %batch_var : tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - } func.func private @Unknown20(%arg0: tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { + %c28 = arith.constant 28 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x128x28x28xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<1x128x28x28xf16>) outs(%0 : tensor<1x128x28x28xf16>) { - ^bb0(%in: f16, %out: f16): - %2 = arith.maxnumf %in, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<1x128x28x28xf16> + %1 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<1x128x28x28xf16>) { + %2 = scf.for %arg3 = %c0 to %c28 step %c1 iter_args(%arg4 = %arg2) -> (tensor<1x128x28x28xf16>) { + %3 = scf.for %arg5 = %c0 to %c28 step %c1 iter_args(%arg6 = %arg4) -> (tensor<1x128x28x28xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg1, %arg3, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x128x28x28xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %out: f16): + %6 = arith.maximumf %in, %cst : f16 + linalg.yield %6 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg6[0, %arg1, %arg3, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x128x28x28xf16> + scf.yield %inserted_slice : tensor<1x128x28x28xf16> + } + scf.yield %3 : tensor<1x128x28x28xf16> + } + scf.yield %2 : tensor<1x128x28x28xf16> + } return %1 : tensor<1x128x28x28xf16> } func.func private @Unknown21(%arg0: tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<128x128x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x128x3x3xf32>) outs(%0 : tensor<128x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<128x128x3x3xf16> + %1 = scf.for 
%arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<128x128x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %arg2) -> (tensor<128x128x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<128x128x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<128x128x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<128x128x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<128x128x3x3xf16> + scf.yield %inserted_slice : tensor<128x128x3x3xf16> + } + scf.yield %4 : tensor<128x128x3x3xf16> + } + scf.yield %3 : tensor<128x128x3x3xf16> + } + scf.yield %2 : tensor<128x128x3x3xf16> + } return %1 : tensor<128x128x3x3xf16> } - func.func private @BatchNormTrainingOp22(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %1 = mhlo.convert %output : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf16> - return %1, %batch_mean, %batch_var : tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - } func.func private @Unknown23(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { + %c28 = arith.constant 28 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x128x28x28xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) outs(%0 : tensor<1x128x28x28xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.addf %in, %in_0 : f16 - %3 = arith.maxnumf %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x128x28x28xf16> - return %1 : tensor<1x128x28x28xf16> - } - func.func private @Unknown24(%arg0: tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<128x128x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x128x3x3xf32>) outs(%0 : tensor<128x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<128x128x3x3xf16> - return %1 : tensor<128x128x3x3xf16> - } - func.func private @BatchNormTrainingOp25(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, 
tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %1 = mhlo.convert %output : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf16> - return %1, %batch_mean, %batch_var : tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - } - func.func private @Unknown26(%arg0: tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x128x28x28xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<1x128x28x28xf16>) outs(%0 : tensor<1x128x28x28xf16>) { - ^bb0(%in: f16, %out: f16): - %2 = arith.maxnumf %in, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<1x128x28x28xf16> - return %1 : tensor<1x128x28x28xf16> - } - func.func private @Unknown27(%arg0: tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<128x128x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x128x3x3xf32>) outs(%0 : tensor<128x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<128x128x3x3xf16> - return %1 : tensor<128x128x3x3xf16> - } - func.func private @BatchNormTrainingOp28(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %1 = mhlo.convert %output : (tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf16> - return %1, %batch_mean, %batch_var : tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - } - func.func private @Unknown29(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x128x28x28xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) outs(%0 : tensor<1x128x28x28xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.addf %in, %in_0 : f16 - %3 = arith.maxnumf %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x128x28x28xf16> + %1 = scf.for %arg2 = %c0 to %c128 step %c1 iter_args(%arg3 = %0) -> (tensor<1x128x28x28xf16>) { + %2 = scf.for %arg4 = %c0 to %c28 step %c1 iter_args(%arg5 = %arg3) -> (tensor<1x128x28x28xf16>) { + %3 = scf.for %arg6 = %c0 to %c28 step %c1 
iter_args(%arg7 = %arg5) -> (tensor<1x128x28x28xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x128x28x28xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x128x28x28xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + %6 = arith.addf %in, %in_1 : f16 + %7 = arith.maximumf %6, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg7[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x128x28x28xf16> + scf.yield %inserted_slice : tensor<1x128x28x28xf16> + } + scf.yield %3 : tensor<1x128x28x28xf16> + } + scf.yield %2 : tensor<1x128x28x28xf16> + } return %1 : tensor<1x128x28x28xf16> } func.func private @Unknown30(%arg0: tensor<256x128x1x1xf32>) -> tensor<256x128x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<256x128x1x1xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x128x1x1xf32>) outs(%0 : tensor<256x128x1x1xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<256x128x1x1xf16> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 iter_args(%arg2 = %0) -> (tensor<256x128x1x1xf16>) { + %2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %arg2) -> (tensor<256x128x1x1xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<256x128x1x1xf32> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f32, %out: f16): + %5 = arith.truncf %in : f32 to f16 + linalg.yield %5 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<256x128x1x1xf16> + scf.yield %inserted_slice : tensor<256x128x1x1xf16> + } + scf.yield %2 : tensor<256x128x1x1xf16> + } return %1 : tensor<256x128x1x1xf16> } func.func private @BatchNormTrainingOp31(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { @@ -275,114 +351,136 @@ module { return %1, %batch_mean, %batch_var : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> } func.func private @Unknown32(%arg0: tensor<256x128x3x3xf32>) -> tensor<256x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<256x128x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x128x3x3xf32>) outs(%0 : tensor<256x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - 
linalg.yield %2 : f16 - } -> tensor<256x128x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 iter_args(%arg2 = %0) -> (tensor<256x128x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %arg2) -> (tensor<256x128x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<256x128x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<256x128x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<256x128x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<256x128x3x3xf16> + scf.yield %inserted_slice : tensor<256x128x3x3xf16> + } + scf.yield %4 : tensor<256x128x3x3xf16> + } + scf.yield %3 : tensor<256x128x3x3xf16> + } + scf.yield %2 : tensor<256x128x3x3xf16> + } return %1 : tensor<256x128x3x3xf16> } - func.func private @BatchNormTrainingOp33(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %1 = mhlo.convert %output : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf16> - return %1, %batch_mean, %batch_var : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - } func.func private @Unknown34(%arg0: tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { + %c14 = arith.constant 14 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x256x14x14xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<1x256x14x14xf16>) outs(%0 : tensor<1x256x14x14xf16>) { - ^bb0(%in: f16, %out: f16): - %2 = arith.maxnumf %in, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<1x256x14x14xf16> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 iter_args(%arg2 = %0) -> (tensor<1x256x14x14xf16>) { + %2 = scf.for %arg3 = %c0 to %c14 step %c1 iter_args(%arg4 = %arg2) -> (tensor<1x256x14x14xf16>) { + %3 = scf.for %arg5 = %c0 to %c14 step %c1 iter_args(%arg6 = %arg4) -> (tensor<1x256x14x14xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg1, %arg3, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x256x14x14xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %out: f16): + %6 = arith.maximumf %in, %cst : f16 + linalg.yield %6 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg6[0, %arg1, %arg3, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into 
tensor<1x256x14x14xf16> + scf.yield %inserted_slice : tensor<1x256x14x14xf16> + } + scf.yield %3 : tensor<1x256x14x14xf16> + } + scf.yield %2 : tensor<1x256x14x14xf16> + } return %1 : tensor<1x256x14x14xf16> } func.func private @Unknown35(%arg0: tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<256x256x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x256x3x3xf32>) outs(%0 : tensor<256x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<256x256x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 iter_args(%arg2 = %0) -> (tensor<256x256x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %arg2) -> (tensor<256x256x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<256x256x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<256x256x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<256x256x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<256x256x3x3xf16> + scf.yield %inserted_slice : tensor<256x256x3x3xf16> + } + scf.yield %4 : tensor<256x256x3x3xf16> + } + scf.yield %3 : tensor<256x256x3x3xf16> + } + scf.yield %2 : tensor<256x256x3x3xf16> + } return %1 : tensor<256x256x3x3xf16> } - func.func private @BatchNormTrainingOp36(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %1 = mhlo.convert %output : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf16> - return %1, %batch_mean, %batch_var : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - } func.func private @Unknown37(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { + %c14 = arith.constant 14 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x256x14x14xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) outs(%0 : tensor<1x256x14x14xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.addf %in, %in_0 : f16 
- %3 = arith.maxnumf %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x256x14x14xf16> - return %1 : tensor<1x256x14x14xf16> - } - func.func private @Unknown38(%arg0: tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<256x256x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x256x3x3xf32>) outs(%0 : tensor<256x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<256x256x3x3xf16> - return %1 : tensor<256x256x3x3xf16> - } - func.func private @BatchNormTrainingOp39(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %1 = mhlo.convert %output : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf16> - return %1, %batch_mean, %batch_var : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - } - func.func private @Unknown40(%arg0: tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x256x14x14xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<1x256x14x14xf16>) outs(%0 : tensor<1x256x14x14xf16>) { - ^bb0(%in: f16, %out: f16): - %2 = arith.maxnumf %in, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<1x256x14x14xf16> - return %1 : tensor<1x256x14x14xf16> - } - func.func private @Unknown41(%arg0: tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<256x256x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x256x3x3xf32>) outs(%0 : tensor<256x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<256x256x3x3xf16> - return %1 : tensor<256x256x3x3xf16> - } - func.func private @BatchNormTrainingOp42(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %1 = mhlo.convert %output : (tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf16> - return %1, %batch_mean, %batch_var : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - } - func.func private @Unknown43(%arg0: tensor<1x256x14x14xf16>, %arg1: 
tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x256x14x14xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) outs(%0 : tensor<1x256x14x14xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.addf %in, %in_0 : f16 - %3 = arith.maxnumf %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x256x14x14xf16> + %1 = scf.for %arg2 = %c0 to %c256 step %c1 iter_args(%arg3 = %0) -> (tensor<1x256x14x14xf16>) { + %2 = scf.for %arg4 = %c0 to %c14 step %c1 iter_args(%arg5 = %arg3) -> (tensor<1x256x14x14xf16>) { + %3 = scf.for %arg6 = %c0 to %c14 step %c1 iter_args(%arg7 = %arg5) -> (tensor<1x256x14x14xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x256x14x14xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x256x14x14xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + %6 = arith.addf %in, %in_1 : f16 + %7 = arith.maximumf %6, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg7[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x256x14x14xf16> + scf.yield %inserted_slice : tensor<1x256x14x14xf16> + } + scf.yield %3 : tensor<1x256x14x14xf16> + } + scf.yield %2 : tensor<1x256x14x14xf16> + } return %1 : tensor<1x256x14x14xf16> } func.func private @Unknown44(%arg0: tensor<512x256x1x1xf32>) -> tensor<512x256x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<512x256x1x1xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x256x1x1xf32>) outs(%0 : tensor<512x256x1x1xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<512x256x1x1xf16> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<512x256x1x1xf16>) { + %2 = scf.for %arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %arg2) -> (tensor<512x256x1x1xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<512x256x1x1xf32> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f32, %out: f16): + %5 = arith.truncf %in : f32 to f16 + linalg.yield %5 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<512x256x1x1xf16> + scf.yield %inserted_slice : tensor<512x256x1x1xf16> + } + scf.yield %2 : tensor<512x256x1x1xf16> + } return %1 : tensor<512x256x1x1xf16> } func.func private @BatchNormTrainingOp45(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : 
f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { @@ -392,793 +490,517 @@ module { return %1, %batch_mean, %batch_var : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> } func.func private @Unknown46(%arg0: tensor<512x256x3x3xf32>) -> tensor<512x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<512x256x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x256x3x3xf32>) outs(%0 : tensor<512x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<512x256x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<512x256x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %arg2) -> (tensor<512x256x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<512x256x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<512x256x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<512x256x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<512x256x3x3xf16> + scf.yield %inserted_slice : tensor<512x256x3x3xf16> + } + scf.yield %4 : tensor<512x256x3x3xf16> + } + scf.yield %3 : tensor<512x256x3x3xf16> + } + scf.yield %2 : tensor<512x256x3x3xf16> + } return %1 : tensor<512x256x3x3xf16> } - func.func private @BatchNormTrainingOp47(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %1 = mhlo.convert %output : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf16> - return %1, %batch_mean, %batch_var : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - } func.func private @Unknown48(%arg0: tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x512x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<1x512x7x7xf16>) outs(%0 : tensor<1x512x7x7xf16>) { - ^bb0(%in: f16, %out: f16): - %2 = arith.maxnumf %in, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<1x512x7x7xf16> + %1 = scf.for 
%arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<1x512x7x7xf16>) { + %2 = scf.for %arg3 = %c0 to %c7 step %c1 iter_args(%arg4 = %arg2) -> (tensor<1x512x7x7xf16>) { + %3 = scf.for %arg5 = %c0 to %c7 step %c1 iter_args(%arg6 = %arg4) -> (tensor<1x512x7x7xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg1, %arg3, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x512x7x7xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %out: f16): + %6 = arith.maximumf %in, %cst : f16 + linalg.yield %6 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg6[0, %arg1, %arg3, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x512x7x7xf16> + scf.yield %inserted_slice : tensor<1x512x7x7xf16> + } + scf.yield %3 : tensor<1x512x7x7xf16> + } + scf.yield %2 : tensor<1x512x7x7xf16> + } return %1 : tensor<1x512x7x7xf16> } func.func private @Unknown49(%arg0: tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<512x512x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x512x3x3xf32>) outs(%0 : tensor<512x512x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<512x512x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<512x512x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %arg2) -> (tensor<512x512x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<512x512x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<512x512x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<512x512x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<512x512x3x3xf16> + scf.yield %inserted_slice : tensor<512x512x3x3xf16> + } + scf.yield %4 : tensor<512x512x3x3xf16> + } + scf.yield %3 : tensor<512x512x3x3xf16> + } + scf.yield %2 : tensor<512x512x3x3xf16> + } return %1 : tensor<512x512x3x3xf16> } - func.func private @BatchNormTrainingOp50(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %1 = mhlo.convert %output : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf16> - return %1, %batch_mean, %batch_var : 
tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - } func.func private @Unknown51(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x512x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) outs(%0 : tensor<1x512x7x7xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.addf %in, %in_0 : f16 - %3 = arith.maxnumf %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x512x7x7xf16> - return %1 : tensor<1x512x7x7xf16> - } - func.func private @Unknown52(%arg0: tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<512x512x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x512x3x3xf32>) outs(%0 : tensor<512x512x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<512x512x3x3xf16> - return %1 : tensor<512x512x3x3xf16> - } - func.func private @BatchNormTrainingOp53(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %1 = mhlo.convert %output : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf16> - return %1, %batch_mean, %batch_var : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - } - func.func private @Unknown54(%arg0: tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x512x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<1x512x7x7xf16>) outs(%0 : tensor<1x512x7x7xf16>) { - ^bb0(%in: f16, %out: f16): - %2 = arith.maxnumf %in, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<1x512x7x7xf16> + %1 = scf.for %arg2 = %c0 to %c512 step %c1 iter_args(%arg3 = %0) -> (tensor<1x512x7x7xf16>) { + %2 = scf.for %arg4 = %c0 to %c7 step %c1 iter_args(%arg5 = %arg3) -> (tensor<1x512x7x7xf16>) { + %3 = scf.for %arg6 = %c0 to %c7 step %c1 iter_args(%arg7 = %arg5) -> (tensor<1x512x7x7xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x512x7x7xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x512x7x7xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + 
%6 = arith.addf %in, %in_1 : f16 + %7 = arith.maximumf %6, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg7[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x512x7x7xf16> + scf.yield %inserted_slice : tensor<1x512x7x7xf16> + } + scf.yield %3 : tensor<1x512x7x7xf16> + } + scf.yield %2 : tensor<1x512x7x7xf16> + } return %1 : tensor<1x512x7x7xf16> } - func.func private @Unknown55(%arg0: tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<512x512x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x512x3x3xf32>) outs(%0 : tensor<512x512x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<512x512x3x3xf16> - return %1 : tensor<512x512x3x3xf16> - } - func.func private @BatchNormTrainingOp56(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %1 = mhlo.convert %output : (tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf16> - return %1, %batch_mean, %batch_var : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - } - func.func private @Unknown57(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown58(%arg0: tensor<1x512x7x7xf16>) -> tensor<1x512xf16> attributes {__byteir_reduction_fusion__} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x512x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) outs(%0 : tensor<1x512x7x7xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.addf %in, %in_0 : f16 - %3 = arith.maxnumf %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x512x7x7xf16> - return %1 : tensor<1x512x7x7xf16> - } - func.func private @Unknown58(%arg0: tensor<1x512xf16>) -> tensor<1x512xf16> attributes {__byteir_elementwise_fusion__} { + %collapsed = tensor.collapse_shape %arg0 [[0, 1], [2, 3]] : tensor<1x512x7x7xf16> into tensor<512x49xf16> + %0 = tensor.empty() : tensor<512xf16> + %1 = scf.forall (%arg1) in (512) shared_outs(%arg2 = %0) -> (tensor<512xf16>) { + %extracted_slice = tensor.extract_slice %collapsed[%arg1, 0] [1, 49] [1, 1] : tensor<512x49xf16> to tensor<49xf16> + %expanded_0 = tensor.expand_shape %extracted_slice [[0, 1]] : tensor<49xf16> into tensor<1x49xf16> + %extracted_slice_1 = tensor.extract_slice %arg2[%arg1] [1] [1] : tensor<512xf16> to tensor + %2 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<64xf16> + %3 = scf.forall (%arg3) in (64) shared_outs(%arg4 = %2) -> (tensor<64xf16>) { + %15 = affine.min #map1(%arg3) + %16 = affine.min 
#map2(%arg3) + %17 = affine.apply #map3(%16, %15) + %extracted_slice_7 = tensor.extract_slice %expanded_0[0, %15] [1, %17] [1, 1] : tensor<1x49xf16> to tensor + %expanded_8 = tensor.expand_shape %extracted_slice_7 [[0, 1]] : tensor into tensor<1x?xf16> + %dim = tensor.dim %expanded_8, %c1 : tensor<1x?xf16> + %18 = arith.cmpi ugt, %dim, %c0 : index + %19 = scf.if %18 -> (f16) { + %extracted = tensor.extract %expanded_8[%c0, %c0] : tensor<1x?xf16> + scf.yield %extracted : f16 + } else { + scf.yield %cst : f16 + } + %20 = arith.addf %19, %cst : f16 + %extracted_slice_9 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<64xf16> to tensor + %inserted = tensor.insert %20 into %extracted_slice_9[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<64xf16> + } + } {mapping = [#gpu.thread]} + %expanded_2 = tensor.expand_shape %3 [[0, 1]] : tensor<64xf16> into tensor<32x2xf16> + %4 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<32xf16> + %5 = scf.forall (%arg3) in (32) shared_outs(%arg4 = %4) -> (tensor<32xf16>) { + %extracted = tensor.extract %expanded_2[%arg3, %c0] : tensor<32x2xf16> + %15 = arith.addf %extracted, %cst : f16 + %extracted_7 = tensor.extract %expanded_2[%arg3, %c1] : tensor<32x2xf16> + %16 = arith.addf %extracted_7, %15 : f16 + %extracted_slice_8 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<32xf16> to tensor + %inserted = tensor.insert %16 into %extracted_slice_8[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<32xf16> + } + } {mapping = [#gpu.thread]} + %expanded_3 = tensor.expand_shape %5 [[0, 1]] : tensor<32xf16> into tensor<16x2xf16> + %6 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<16xf16> + %7 = scf.forall (%arg3) in (16) shared_outs(%arg4 = %6) -> (tensor<16xf16>) { + %extracted = tensor.extract %expanded_3[%arg3, %c0] : tensor<16x2xf16> + %15 = arith.addf %extracted, %cst : f16 + %extracted_7 = tensor.extract %expanded_3[%arg3, %c1] : tensor<16x2xf16> + %16 = arith.addf %extracted_7, %15 : f16 + %extracted_slice_8 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<16xf16> to tensor + %inserted = tensor.insert %16 into %extracted_slice_8[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<16xf16> + } + } {mapping = [#gpu.thread]} + %expanded_4 = tensor.expand_shape %7 [[0, 1]] : tensor<16xf16> into tensor<8x2xf16> + %8 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<8xf16> + %9 = scf.forall (%arg3) in (8) shared_outs(%arg4 = %8) -> (tensor<8xf16>) { + %extracted = tensor.extract %expanded_4[%arg3, %c0] : tensor<8x2xf16> + %15 = arith.addf %extracted, %cst : f16 + %extracted_7 = tensor.extract %expanded_4[%arg3, %c1] : tensor<8x2xf16> + %16 = arith.addf %extracted_7, %15 : f16 + %extracted_slice_8 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<8xf16> to tensor + %inserted = tensor.insert %16 into %extracted_slice_8[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<8xf16> + } + } {mapping = [#gpu.thread]} + %expanded_5 = tensor.expand_shape %9 [[0, 1]] : tensor<8xf16> into tensor<4x2xf16> + %10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<4xf16> + %11 = scf.forall (%arg3) in (4) shared_outs(%arg4 = %10) -> (tensor<4xf16>) { + %extracted = 
tensor.extract %expanded_5[%arg3, %c0] : tensor<4x2xf16> + %15 = arith.addf %extracted, %cst : f16 + %extracted_7 = tensor.extract %expanded_5[%arg3, %c1] : tensor<4x2xf16> + %16 = arith.addf %extracted_7, %15 : f16 + %extracted_slice_8 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<4xf16> to tensor + %inserted = tensor.insert %16 into %extracted_slice_8[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<4xf16> + } + } {mapping = [#gpu.thread]} + %expanded_6 = tensor.expand_shape %11 [[0, 1]] : tensor<4xf16> into tensor<2x2xf16> + %12 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<2xf16> + %13 = scf.forall (%arg3) in (2) shared_outs(%arg4 = %12) -> (tensor<2xf16>) { + %extracted = tensor.extract %expanded_6[%arg3, %c0] : tensor<2x2xf16> + %15 = arith.addf %extracted, %cst : f16 + %extracted_7 = tensor.extract %expanded_6[%arg3, %c1] : tensor<2x2xf16> + %16 = arith.addf %extracted_7, %15 : f16 + %extracted_slice_8 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<2xf16> to tensor + %inserted = tensor.insert %16 into %extracted_slice_8[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<2xf16> + } + } {mapping = [#gpu.thread]} + %14 = scf.forall (%arg3) in (1) shared_outs(%arg4 = %extracted_slice_1) -> (tensor) { + %15 = affine.apply #map4(%arg3) + %extracted = tensor.extract %13[%15] : tensor<2xf16> + %16 = arith.addf %extracted, %cst : f16 + %17 = affine.apply #map5(%arg3) + %extracted_7 = tensor.extract %13[%17] : tensor<2xf16> + %18 = arith.addf %extracted_7, %16 : f16 + %extracted_slice_8 = tensor.extract_slice %arg4[] [] [] : tensor to tensor + %inserted = tensor.insert %18 into %extracted_slice_8[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[] [] [] : tensor into tensor + } + } {mapping = [#gpu.thread]} + scf.forall.in_parallel { + tensor.parallel_insert_slice %14 into %arg2[%arg1] [1] [1] : tensor into tensor<512xf16> + } + } {mapping = [#gpu.block]} + %expanded = tensor.expand_shape %1 [[0, 1]] : tensor<512xf16> into tensor<1x512xf16> + return %expanded : tensor<1x512xf16> + } + func.func private @Unknown59(%arg0: tensor<1x512xf16>) -> tensor<1x512xf16> attributes {__byteir_elementwise_fusion__} { + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 2.040100e-02 : f16 %0 = tensor.empty() : tensor<1x512xf16> - %1 = linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x512xf16>) outs(%0 : tensor<1x512xf16>) { - ^bb0(%in: f16, %out: f16): - %2 = arith.mulf %in, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<1x512xf16> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<1x512xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg1] [1, 1] [1, 1] : tensor<1x512xf16> to tensor + %2 = tensor.empty() : tensor + %3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%2 : tensor) { + ^bb0(%in: f16, %out: f16): + %4 = arith.mulf %in, %cst : f16 + linalg.yield %4 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %3 into %arg2[0, %arg1] [1, 1] [1, 1] : tensor into tensor<1x512xf16> + scf.yield %inserted_slice : tensor<1x512xf16> + } return %1 : tensor<1x512xf16> } - func.func private @Unknown59(%arg0: tensor<1000x512xf32>) -> 
tensor<1000x512xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown60(%arg0: tensor<1000x512xf32>) -> tensor<1000x512xf16> attributes {__byteir_elementwise_fusion__} { + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<1000x512xf16> - %1 = linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1000x512xf32>) outs(%0 : tensor<1000x512xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<1000x512xf16> + %1 = scf.for %arg1 = %c0 to %c1000 step %c1 iter_args(%arg2 = %0) -> (tensor<1000x512xf16>) { + %2 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %arg2) -> (tensor<1000x512xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3] [1, 1] [1, 1] : tensor<1000x512xf32> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f32, %out: f16): + %5 = arith.truncf %in : f32 to f16 + linalg.yield %5 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3] [1, 1] [1, 1] : tensor into tensor<1000x512xf16> + scf.yield %inserted_slice : tensor<1000x512xf16> + } + scf.yield %2 : tensor<1000x512xf16> + } return %1 : tensor<1000x512xf16> } - func.func private @Unknown60(%arg0: tensor<1000xf32>, %arg1: tensor<1x1000xf16>) -> tensor<1x1000xf16> attributes {__byteir_elementwise_fusion__} { - %expanded = tensor.expand_shape %arg0 [[0, 1]] : tensor<1000xf32> into tensor<1x1000xf32> + func.func private @Unknown61(%arg0: tensor<1000xf32>, %arg1: tensor<1x1000xf16>) -> tensor<1x1000xf16> attributes {__byteir_elementwise_fusion__} { + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<1x1000xf16> - %1 = linalg.generic {indexing_maps = [#map1, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg1, %expanded : tensor<1x1000xf16>, tensor<1x1000xf32>) outs(%0 : tensor<1x1000xf16>) { - ^bb0(%in: f16, %in_0: f32, %out: f16): - %2 = arith.truncf %in_0 : f32 to f16 - %3 = arith.addf %in, %2 : f16 - linalg.yield %3 : f16 - } -> tensor<1x1000xf16> + %1 = scf.for %arg2 = %c0 to %c1000 step %c1 iter_args(%arg3 = %0) -> (tensor<1x1000xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg2] [1] [1] : tensor<1000xf32> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg2] [1, 1] [1, 1] : tensor<1x1000xf16> to tensor + %2 = tensor.empty() : tensor + %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%2 : tensor) { + ^bb0(%in: f32, %in_1: f16, %out: f16): + %4 = arith.truncf %in : f32 to f16 + %5 = arith.addf %in_1, %4 : f16 + linalg.yield %5 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %3 into %arg3[0, %arg2] [1, 1] [1, 1] : tensor into tensor<1x1000xf16> + scf.yield %inserted_slice : tensor<1x1000xf16> + } return %1 : tensor<1x1000xf16> } - func.func private @Unknown61(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<64xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = 
["parallel"]} ins(%arg0, %arg1 : tensor<64xf32>, tensor<64xf32>) outs(%0 : tensor<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<64xf32> - return %1 : tensor<64xf32> - } func.func private @Unknown62(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.899999976 : f32 %cst_0 = arith.constant 1.000000e-01 : f32 %0 = tensor.empty() : tensor<64xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<64xf32>, tensor<64xf32>) outs(%0 : tensor<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<64xf32> - return %1 : tensor<64xf32> - } - func.func private @Unknown63(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<64xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<64xf32>, tensor<64xf32>) outs(%0 : tensor<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<64xf32> - return %1 : tensor<64xf32> - } - func.func private @Unknown64(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<64xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<64xf32>, tensor<64xf32>) outs(%0 : tensor<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<64xf32> - return %1 : tensor<64xf32> - } - func.func private @Unknown65(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<64xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<64xf32>, tensor<64xf32>) outs(%0 : tensor<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<64xf32> - return %1 : tensor<64xf32> - } - func.func private @Unknown66(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<64xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<64xf32>, tensor<64xf32>) outs(%0 : tensor<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf 
%in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<64xf32> - return %1 : tensor<64xf32> - } - func.func private @Unknown67(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<64xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<64xf32>, tensor<64xf32>) outs(%0 : tensor<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<64xf32> - return %1 : tensor<64xf32> - } - func.func private @Unknown68(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<64xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<64xf32>, tensor<64xf32>) outs(%0 : tensor<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<64xf32> - return %1 : tensor<64xf32> - } - func.func private @Unknown69(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<64xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<64xf32>, tensor<64xf32>) outs(%0 : tensor<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<64xf32> - return %1 : tensor<64xf32> - } - func.func private @Unknown70(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<64xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<64xf32>, tensor<64xf32>) outs(%0 : tensor<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<64xf32> + %1 = scf.for %arg2 = %c0 to %c64 step %c1 iter_args(%arg3 = %0) -> (tensor<64xf32>) { + %extracted_slice = tensor.extract_slice %arg1[%arg2] [1] [1] : tensor<64xf32> to tensor + %extracted_slice_1 = tensor.extract_slice %arg0[%arg2] [1] [1] : tensor<64xf32> to tensor + %2 = tensor.empty() : tensor + %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_1 : tensor, tensor) outs(%2 : tensor) { + ^bb0(%in: f32, %in_2: f32, %out: f32): + %4 = arith.mulf %in, %cst : f32 + %5 = arith.mulf %in_2, %cst_0 : f32 + %6 = arith.addf %5, %4 : f32 + linalg.yield %6 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %3 into %arg3[%arg2] [1] [1] : tensor into tensor<64xf32> + scf.yield %inserted_slice : 
tensor<64xf32> + } return %1 : tensor<64xf32> } - func.func private @Unknown71(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<128xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<128xf32>, tensor<128xf32>) outs(%0 : tensor<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<128xf32> - return %1 : tensor<128xf32> - } func.func private @Unknown72(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.899999976 : f32 %cst_0 = arith.constant 1.000000e-01 : f32 %0 = tensor.empty() : tensor<128xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<128xf32>, tensor<128xf32>) outs(%0 : tensor<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<128xf32> - return %1 : tensor<128xf32> - } - func.func private @Unknown73(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<128xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<128xf32>, tensor<128xf32>) outs(%0 : tensor<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<128xf32> - return %1 : tensor<128xf32> - } - func.func private @Unknown74(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<128xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<128xf32>, tensor<128xf32>) outs(%0 : tensor<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<128xf32> - return %1 : tensor<128xf32> - } - func.func private @Unknown75(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<128xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<128xf32>, tensor<128xf32>) outs(%0 : tensor<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<128xf32> - return %1 : tensor<128xf32> - } - func.func private @Unknown76(%arg0: tensor<128xf32>, %arg1: 
tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<128xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<128xf32>, tensor<128xf32>) outs(%0 : tensor<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<128xf32> - return %1 : tensor<128xf32> - } - func.func private @Unknown77(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<128xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<128xf32>, tensor<128xf32>) outs(%0 : tensor<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<128xf32> - return %1 : tensor<128xf32> - } - func.func private @Unknown78(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<128xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<128xf32>, tensor<128xf32>) outs(%0 : tensor<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<128xf32> - return %1 : tensor<128xf32> - } - func.func private @Unknown79(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<128xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<128xf32>, tensor<128xf32>) outs(%0 : tensor<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<128xf32> - return %1 : tensor<128xf32> - } - func.func private @Unknown80(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<128xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<128xf32>, tensor<128xf32>) outs(%0 : tensor<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<128xf32> + %1 = scf.for %arg2 = %c0 to %c128 step %c1 iter_args(%arg3 = %0) -> (tensor<128xf32>) { + %extracted_slice = tensor.extract_slice %arg1[%arg2] [1] [1] : tensor<128xf32> to tensor + %extracted_slice_1 = tensor.extract_slice %arg0[%arg2] [1] [1] : tensor<128xf32> to tensor + %2 = tensor.empty() : 
tensor + %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_1 : tensor, tensor) outs(%2 : tensor) { + ^bb0(%in: f32, %in_2: f32, %out: f32): + %4 = arith.mulf %in, %cst : f32 + %5 = arith.mulf %in_2, %cst_0 : f32 + %6 = arith.addf %5, %4 : f32 + linalg.yield %6 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %3 into %arg3[%arg2] [1] [1] : tensor into tensor<128xf32> + scf.yield %inserted_slice : tensor<128xf32> + } return %1 : tensor<128xf32> } - func.func private @Unknown81(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<256xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<256xf32>, tensor<256xf32>) outs(%0 : tensor<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<256xf32> - return %1 : tensor<256xf32> - } func.func private @Unknown82(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.899999976 : f32 %cst_0 = arith.constant 1.000000e-01 : f32 %0 = tensor.empty() : tensor<256xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<256xf32>, tensor<256xf32>) outs(%0 : tensor<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<256xf32> - return %1 : tensor<256xf32> - } - func.func private @Unknown83(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<256xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<256xf32>, tensor<256xf32>) outs(%0 : tensor<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<256xf32> - return %1 : tensor<256xf32> - } - func.func private @Unknown84(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<256xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<256xf32>, tensor<256xf32>) outs(%0 : tensor<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<256xf32> - return %1 : tensor<256xf32> - } - func.func private @Unknown85(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() 
: tensor<256xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<256xf32>, tensor<256xf32>) outs(%0 : tensor<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<256xf32> - return %1 : tensor<256xf32> - } - func.func private @Unknown86(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<256xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<256xf32>, tensor<256xf32>) outs(%0 : tensor<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<256xf32> - return %1 : tensor<256xf32> - } - func.func private @Unknown87(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<256xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<256xf32>, tensor<256xf32>) outs(%0 : tensor<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<256xf32> - return %1 : tensor<256xf32> - } - func.func private @Unknown88(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<256xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<256xf32>, tensor<256xf32>) outs(%0 : tensor<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<256xf32> - return %1 : tensor<256xf32> - } - func.func private @Unknown89(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<256xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<256xf32>, tensor<256xf32>) outs(%0 : tensor<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<256xf32> - return %1 : tensor<256xf32> - } - func.func private @Unknown90(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<256xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<256xf32>, tensor<256xf32>) outs(%0 : tensor<256xf32>) { - ^bb0(%in: 
f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<256xf32> + %1 = scf.for %arg2 = %c0 to %c256 step %c1 iter_args(%arg3 = %0) -> (tensor<256xf32>) { + %extracted_slice = tensor.extract_slice %arg1[%arg2] [1] [1] : tensor<256xf32> to tensor + %extracted_slice_1 = tensor.extract_slice %arg0[%arg2] [1] [1] : tensor<256xf32> to tensor + %2 = tensor.empty() : tensor + %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_1 : tensor, tensor) outs(%2 : tensor) { + ^bb0(%in: f32, %in_2: f32, %out: f32): + %4 = arith.mulf %in, %cst : f32 + %5 = arith.mulf %in_2, %cst_0 : f32 + %6 = arith.addf %5, %4 : f32 + linalg.yield %6 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %3 into %arg3[%arg2] [1] [1] : tensor into tensor<256xf32> + scf.yield %inserted_slice : tensor<256xf32> + } return %1 : tensor<256xf32> } - func.func private @Unknown91(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<512xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<512xf32>, tensor<512xf32>) outs(%0 : tensor<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<512xf32> - return %1 : tensor<512xf32> - } func.func private @Unknown92(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.899999976 : f32 %cst_0 = arith.constant 1.000000e-01 : f32 %0 = tensor.empty() : tensor<512xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<512xf32>, tensor<512xf32>) outs(%0 : tensor<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<512xf32> - return %1 : tensor<512xf32> - } - func.func private @Unknown93(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<512xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<512xf32>, tensor<512xf32>) outs(%0 : tensor<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<512xf32> - return %1 : tensor<512xf32> - } - func.func private @Unknown94(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<512xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<512xf32>, tensor<512xf32>) outs(%0 : tensor<512xf32>) { 
- ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<512xf32> - return %1 : tensor<512xf32> - } - func.func private @Unknown95(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<512xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<512xf32>, tensor<512xf32>) outs(%0 : tensor<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<512xf32> - return %1 : tensor<512xf32> - } - func.func private @Unknown96(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<512xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<512xf32>, tensor<512xf32>) outs(%0 : tensor<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<512xf32> - return %1 : tensor<512xf32> - } - func.func private @Unknown97(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<512xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<512xf32>, tensor<512xf32>) outs(%0 : tensor<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<512xf32> - return %1 : tensor<512xf32> - } - func.func private @Unknown98(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<512xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<512xf32>, tensor<512xf32>) outs(%0 : tensor<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<512xf32> - return %1 : tensor<512xf32> - } - func.func private @Unknown99(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<512xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<512xf32>, tensor<512xf32>) outs(%0 : tensor<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<512xf32> - return %1 : 
tensor<512xf32> - } - func.func private @Unknown100(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<512xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<512xf32>, tensor<512xf32>) outs(%0 : tensor<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<512xf32> + %1 = scf.for %arg2 = %c0 to %c512 step %c1 iter_args(%arg3 = %0) -> (tensor<512xf32>) { + %extracted_slice = tensor.extract_slice %arg1[%arg2] [1] [1] : tensor<512xf32> to tensor + %extracted_slice_1 = tensor.extract_slice %arg0[%arg2] [1] [1] : tensor<512xf32> to tensor + %2 = tensor.empty() : tensor + %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_1 : tensor, tensor) outs(%2 : tensor) { + ^bb0(%in: f32, %in_2: f32, %out: f32): + %4 = arith.mulf %in, %cst : f32 + %5 = arith.mulf %in_2, %cst_0 : f32 + %6 = arith.addf %5, %4 : f32 + linalg.yield %6 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %3 into %arg3[%arg2] [1] [1] : tensor into tensor<512xf32> + scf.yield %inserted_slice : tensor<512xf32> + } return %1 : tensor<512xf32> } func.func @main(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>, %arg2: tensor<64x3x7x7xf32>, %arg3: tensor<1000xf32>, %arg4: tensor<1000x512xf32>, %arg5: tensor<64xf32>, %arg6: tensor<64xf32>, %arg7: tensor<64xf32>, %arg8: tensor<64xf32>, %arg9: tensor<64x64x3x3xf32>, %arg10: tensor<64x64x3x3xf32>, %arg11: tensor<64xf32>, %arg12: tensor<64xf32>, %arg13: tensor<64xf32>, %arg14: tensor<64xf32>, %arg15: tensor<64x64x3x3xf32>, %arg16: tensor<64x64x3x3xf32>, %arg17: tensor<128xf32>, %arg18: tensor<128xf32>, %arg19: tensor<128xf32>, %arg20: tensor<128xf32>, %arg21: tensor<128x64x3x3xf32>, %arg22: tensor<128x128x3x3xf32>, %arg23: tensor<128x64x1x1xf32>, %arg24: tensor<128xf32>, %arg25: tensor<128xf32>, %arg26: tensor<128xf32>, %arg27: tensor<128xf32>, %arg28: tensor<128xf32>, %arg29: tensor<128xf32>, %arg30: tensor<128x128x3x3xf32>, %arg31: tensor<128x128x3x3xf32>, %arg32: tensor<256xf32>, %arg33: tensor<256xf32>, %arg34: tensor<256xf32>, %arg35: tensor<256xf32>, %arg36: tensor<256x128x3x3xf32>, %arg37: tensor<256x256x3x3xf32>, %arg38: tensor<256x128x1x1xf32>, %arg39: tensor<256xf32>, %arg40: tensor<256xf32>, %arg41: tensor<256xf32>, %arg42: tensor<256xf32>, %arg43: tensor<256xf32>, %arg44: tensor<256xf32>, %arg45: tensor<256x256x3x3xf32>, %arg46: tensor<256x256x3x3xf32>, %arg47: tensor<512xf32>, %arg48: tensor<512xf32>, %arg49: tensor<512xf32>, %arg50: tensor<512xf32>, %arg51: tensor<512x256x3x3xf32>, %arg52: tensor<512x512x3x3xf32>, %arg53: tensor<512x256x1x1xf32>, %arg54: tensor<512xf32>, %arg55: tensor<512xf32>, %arg56: tensor<512xf32>, %arg57: tensor<512xf32>, %arg58: tensor<512xf32>, %arg59: tensor<512xf32>, %arg60: tensor<512x512x3x3xf32>, %arg61: tensor<512x512x3x3xf32>, %arg62: tensor, %arg63: tensor<64xf32>, %arg64: tensor<64xf32>, %arg65: tensor, %arg66: tensor<64xf32>, %arg67: tensor<64xf32>, %arg68: tensor, %arg69: tensor<64xf32>, %arg70: tensor<64xf32>, %arg71: tensor, %arg72: tensor<64xf32>, %arg73: tensor<64xf32>, %arg74: tensor, %arg75: tensor<64xf32>, %arg76: tensor<64xf32>, %arg77: tensor, %arg78: tensor<128xf32>, %arg79: 
tensor<128xf32>, %arg80: tensor, %arg81: tensor<128xf32>, %arg82: tensor<128xf32>, %arg83: tensor, %arg84: tensor<128xf32>, %arg85: tensor<128xf32>, %arg86: tensor, %arg87: tensor<128xf32>, %arg88: tensor<128xf32>, %arg89: tensor, %arg90: tensor<128xf32>, %arg91: tensor<128xf32>, %arg92: tensor, %arg93: tensor<256xf32>, %arg94: tensor<256xf32>, %arg95: tensor, %arg96: tensor<256xf32>, %arg97: tensor<256xf32>, %arg98: tensor, %arg99: tensor<256xf32>, %arg100: tensor<256xf32>, %arg101: tensor, %arg102: tensor<256xf32>, %arg103: tensor<256xf32>, %arg104: tensor, %arg105: tensor<256xf32>, %arg106: tensor<256xf32>, %arg107: tensor, %arg108: tensor<512xf32>, %arg109: tensor<512xf32>, %arg110: tensor, %arg111: tensor<512xf32>, %arg112: tensor<512xf32>, %arg113: tensor, %arg114: tensor<512xf32>, %arg115: tensor<512xf32>, %arg116: tensor, %arg117: tensor<512xf32>, %arg118: tensor<512xf32>, %arg119: tensor, %arg120: tensor<512xf32>, %arg121: tensor<512xf32>, %arg122: tensor<1x3x224x224xf32>) -> (tensor<1x1000xf16>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<64x3x7x7xf16>, tensor<1x3x224x224xf16>, tensor<1x64x112x112xf16>, tensor<1x64x112x112xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<128x64x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<128x64x1x1xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<256x128x3x3xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, tensor<1x256x14x14xf16>, tensor<256x128x1x1xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<512x256x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, 
tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<512x256x1x1xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<1x512xf16>, tensor<512x1000xf16>) { - %0 = mhlo.constant dense<0.000000e+00> : tensor - %1 = mhlo.constant dense<0xFC00> : tensor - %2 = call @Unknown0(%arg122) : (tensor<1x3x224x224xf32>) -> tensor<1x3x224x224xf16> - %3 = call @Unknown1(%arg2) : (tensor<64x3x7x7xf32>) -> tensor<64x3x7x7xf16> - %4 = mhlo.convolution(%2, %3) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[3, 3], [3, 3]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x3x224x224xf16>, tensor<64x3x7x7xf16>) -> tensor<1x64x112x112xf16> - %5:3 = call @BatchNormTrainingOp2(%4, %arg1, %arg0) : (tensor<1x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) - %6 = call @Unknown3(%5#0) : (tensor<1x64x112x112xf16>) -> tensor<1x64x112x112xf16> - %7 = "mhlo.reduce_window"(%6, %1) ({ + %0 = mhlo.constant dense<0xFC00> : tensor + %1 = call @Unknown0(%arg122) : (tensor<1x3x224x224xf32>) -> tensor<1x3x224x224xf16> + %2 = call @Unknown1(%arg2) : (tensor<64x3x7x7xf32>) -> tensor<64x3x7x7xf16> + %3 = mhlo.convolution(%1, %2) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[3, 3], [3, 3]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x3x224x224xf16>, tensor<64x3x7x7xf16>) -> tensor<1x64x112x112xf16> + %4:3 = call @BatchNormTrainingOp2(%3, %arg1, %arg0) : (tensor<1x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) + %5 = call @Unknown3(%4#0) : (tensor<1x64x112x112xf16>) -> tensor<1x64x112x112xf16> + %6 = "mhlo.reduce_window"(%5, %0) ({ ^bb0(%arg123: tensor, %arg124: tensor): - %127 = mhlo.maximum %arg123, %arg124 : tensor - mhlo.return %127 : tensor + %126 = mhlo.maximum %arg123, %arg124 : tensor + mhlo.return %126 : tensor }) {base_dilations = dense<1> : tensor<4xi64>, padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = dense<1> : tensor<4xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : (tensor<1x64x112x112xf16>, tensor) -> tensor<1x64x56x56xf16> - %8 = call @Unknown4(%arg9) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> - %9 = mhlo.convolution(%7, %8) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> - %10:3 = call @BatchNormTrainingOp5(%9, %arg6, %arg5) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) - %11 = call @Unknown6(%10#0) : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> - %12 = call @Unknown7(%arg10) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> - %13 = mhlo.convolution(%11, %12) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = 
[1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> - %14:3 = call @BatchNormTrainingOp8(%13, %arg8, %arg7) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) - %15 = call @Unknown9(%14#0, %7) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> - %16 = call @Unknown10(%arg15) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> - %17 = mhlo.convolution(%15, %16) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> - %18:3 = call @BatchNormTrainingOp11(%17, %arg12, %arg11) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) - %19 = call @Unknown12(%18#0) : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> - %20 = call @Unknown13(%arg16) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> - %21 = mhlo.convolution(%19, %20) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> - %22:3 = call @BatchNormTrainingOp14(%21, %arg14, %arg13) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) - %23 = call @Unknown15(%22#0, %15) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> - %24 = call @Unknown16(%arg23) : (tensor<128x64x1x1xf32>) -> tensor<128x64x1x1xf16> - %25 = mhlo.convolution(%23, %24) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<128x64x1x1xf16>) -> tensor<1x128x28x28xf16> - %26:3 = call @BatchNormTrainingOp17(%25, %arg25, %arg24) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %27 = call @Unknown18(%arg21) : (tensor<128x64x3x3xf32>) -> tensor<128x64x3x3xf16> - %28 = mhlo.convolution(%23, %27) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<128x64x3x3xf16>) -> tensor<1x128x28x28xf16> - %29:3 = call @BatchNormTrainingOp19(%28, %arg18, %arg17) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %30 = call @Unknown20(%29#0) : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> - %31 = call @Unknown21(%arg22) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> - %32 = mhlo.convolution(%30, %31) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 
1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> - %33:3 = call @BatchNormTrainingOp22(%32, %arg20, %arg19) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %34 = call @Unknown23(%33#0, %26#0) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> - %35 = call @Unknown24(%arg30) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> - %36 = mhlo.convolution(%34, %35) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> - %37:3 = call @BatchNormTrainingOp25(%36, %arg27, %arg26) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %38 = call @Unknown26(%37#0) : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> - %39 = call @Unknown27(%arg31) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> - %40 = mhlo.convolution(%38, %39) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> - %41:3 = call @BatchNormTrainingOp28(%40, %arg29, %arg28) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %42 = call @Unknown29(%41#0, %34) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> - %43 = call @Unknown30(%arg38) : (tensor<256x128x1x1xf32>) -> tensor<256x128x1x1xf16> - %44 = mhlo.convolution(%42, %43) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<256x128x1x1xf16>) -> tensor<1x256x14x14xf16> - %45:3 = call @BatchNormTrainingOp31(%44, %arg40, %arg39) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %46 = call @Unknown32(%arg36) : (tensor<256x128x3x3xf32>) -> tensor<256x128x3x3xf16> - %47 = mhlo.convolution(%42, %46) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<256x128x3x3xf16>) -> tensor<1x256x14x14xf16> - %48:3 = call @BatchNormTrainingOp33(%47, %arg33, %arg32) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %49 = call @Unknown34(%48#0) : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> - %50 = call @Unknown35(%arg37) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> - %51 = mhlo.convolution(%49, %50) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count 
= 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> - %52:3 = call @BatchNormTrainingOp36(%51, %arg35, %arg34) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %53 = call @Unknown37(%52#0, %45#0) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> - %54 = call @Unknown38(%arg45) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> - %55 = mhlo.convolution(%53, %54) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> - %56:3 = call @BatchNormTrainingOp39(%55, %arg42, %arg41) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %57 = call @Unknown40(%56#0) : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> - %58 = call @Unknown41(%arg46) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> - %59 = mhlo.convolution(%57, %58) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> - %60:3 = call @BatchNormTrainingOp42(%59, %arg44, %arg43) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %61 = call @Unknown43(%60#0, %53) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> - %62 = call @Unknown44(%arg53) : (tensor<512x256x1x1xf32>) -> tensor<512x256x1x1xf16> - %63 = mhlo.convolution(%61, %62) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<512x256x1x1xf16>) -> tensor<1x512x7x7xf16> - %64:3 = call @BatchNormTrainingOp45(%63, %arg55, %arg54) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %65 = call @Unknown46(%arg51) : (tensor<512x256x3x3xf32>) -> tensor<512x256x3x3xf16> - %66 = mhlo.convolution(%61, %65) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<512x256x3x3xf16>) -> tensor<1x512x7x7xf16> - %67:3 = call @BatchNormTrainingOp47(%66, %arg48, %arg47) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %68 = call @Unknown48(%67#0) : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> - %69 = call @Unknown49(%arg52) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> - %70 = mhlo.convolution(%68, %69) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, 
precision_config = [#mhlo, #mhlo]} : (tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> - %71:3 = call @BatchNormTrainingOp50(%70, %arg50, %arg49) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %72 = call @Unknown51(%71#0, %64#0) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> - %73 = call @Unknown52(%arg60) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> - %74 = mhlo.convolution(%72, %73) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> - %75:3 = call @BatchNormTrainingOp53(%74, %arg57, %arg56) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %76 = call @Unknown54(%75#0) : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> - %77 = call @Unknown55(%arg61) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> - %78 = mhlo.convolution(%76, %77) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> - %79:3 = call @BatchNormTrainingOp56(%78, %arg59, %arg58) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %80 = call @Unknown57(%79#0, %72) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> - %81 = mhlo.reduce(%80 init: %0) across dimensions = [3, 2] : (tensor<1x512x7x7xf16>, tensor) -> tensor<1x512xf16> - reducer(%arg123: tensor, %arg124: tensor) { - %127 = mhlo.add %arg123, %arg124 : tensor - mhlo.return %127 : tensor - } - %82 = call @Unknown58(%81) : (tensor<1x512xf16>) -> tensor<1x512xf16> - %83 = call @Unknown59(%arg4) : (tensor<1000x512xf32>) -> tensor<1000x512xf16> - %84 = "mhlo.transpose"(%83) {minor_to_major = dense<[0, 1]> : tensor<2xindex>, permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<1000x512xf16>) -> tensor<512x1000xf16> - %85 = "mhlo.dot_general"(%82, %83) {dot_dimension_numbers = #mhlo.dot, precision_config = [#mhlo, #mhlo]} : (tensor<1x512xf16>, tensor<1000x512xf16>) -> tensor<1x1000xf16> - %86 = call @Unknown60(%arg3, %85) : (tensor<1000xf32>, tensor<1x1000xf16>) -> tensor<1x1000xf16> - %87 = call @Unknown61(%5#1, %arg63) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %88 = call @Unknown62(%5#2, %arg64) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %89 = call @Unknown63(%10#1, %arg66) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %90 = call @Unknown64(%10#2, %arg67) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %91 = call @Unknown65(%14#1, %arg69) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %92 = call @Unknown66(%14#2, %arg70) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %93 = call @Unknown67(%18#1, %arg72) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %94 = call @Unknown68(%18#2, %arg73) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %95 = call @Unknown69(%22#1, %arg75) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %96 = call @Unknown70(%22#2, %arg76) : 
(tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %97 = call @Unknown71(%29#1, %arg78) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %98 = call @Unknown72(%29#2, %arg79) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %99 = call @Unknown73(%33#1, %arg81) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %100 = call @Unknown74(%33#2, %arg82) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %101 = call @Unknown75(%26#1, %arg84) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %102 = call @Unknown76(%26#2, %arg85) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %103 = call @Unknown77(%37#1, %arg87) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %104 = call @Unknown78(%37#2, %arg88) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %105 = call @Unknown79(%41#1, %arg90) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %106 = call @Unknown80(%41#2, %arg91) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %107 = call @Unknown81(%48#1, %arg93) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %108 = call @Unknown82(%48#2, %arg94) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %109 = call @Unknown83(%52#1, %arg96) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %110 = call @Unknown84(%52#2, %arg97) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %111 = call @Unknown85(%45#1, %arg99) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %112 = call @Unknown86(%45#2, %arg100) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %113 = call @Unknown87(%56#1, %arg102) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %114 = call @Unknown88(%56#2, %arg103) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %115 = call @Unknown89(%60#1, %arg105) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %116 = call @Unknown90(%60#2, %arg106) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %117 = call @Unknown91(%67#1, %arg108) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %118 = call @Unknown92(%67#2, %arg109) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %119 = call @Unknown93(%71#1, %arg111) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %120 = call @Unknown94(%71#2, %arg112) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %121 = call @Unknown95(%64#1, %arg114) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %122 = call @Unknown96(%64#2, %arg115) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %123 = call @Unknown97(%75#1, %arg117) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %124 = call @Unknown98(%75#2, %arg118) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %125 = call @Unknown99(%79#1, %arg120) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %126 = call @Unknown100(%79#2, %arg121) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - return %86, %arg0, %arg1, %arg5, %arg6, %arg7, %arg8, %arg11, %arg12, %arg13, %arg14, %arg17, %arg18, %arg19, %arg20, %arg24, %arg25, %arg26, %arg27, %arg28, %arg29, %arg32, %arg33, %arg34, %arg35, %arg39, %arg40, %arg41, %arg42, %arg43, %arg44, %arg47, %arg48, %arg49, %arg50, %arg54, %arg55, %arg56, %arg57, %arg58, %arg59, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, %102, %103, %104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, %121, %122, %123, %124, %125, %126, %3, %2, %4, %6, %7, %8, %9, %11, %12, %13, %15, %16, 
%17, %19, %20, %21, %23, %27, %28, %30, %31, %32, %24, %25, %34, %35, %36, %38, %39, %40, %42, %46, %47, %49, %50, %51, %43, %44, %53, %54, %55, %57, %58, %59, %61, %65, %66, %68, %69, %70, %62, %63, %72, %73, %74, %76, %77, %78, %80, %82, %84 : tensor<1x1000xf16>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<64x3x7x7xf16>, tensor<1x3x224x224xf16>, tensor<1x64x112x112xf16>, tensor<1x64x112x112xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<128x64x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<128x64x1x1xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<256x128x3x3xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, tensor<1x256x14x14xf16>, tensor<256x128x1x1xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<512x256x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<512x256x1x1xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<1x512xf16>, tensor<512x1000xf16> + %7 = call @Unknown4(%arg9) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> + %8 = mhlo.convolution(%6, %7) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> + %9:3 = call @BatchNormTrainingOp5(%8, 
%arg6, %arg5) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) + %10 = call @Unknown6(%9#0) : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> + %11 = call @Unknown4(%arg10) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> + %12 = mhlo.convolution(%10, %11) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> + %13:3 = call @BatchNormTrainingOp5(%12, %arg8, %arg7) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) + %14 = call @Unknown9(%13#0, %6) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> + %15 = call @Unknown4(%arg15) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> + %16 = mhlo.convolution(%14, %15) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> + %17:3 = call @BatchNormTrainingOp5(%16, %arg12, %arg11) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) + %18 = call @Unknown6(%17#0) : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> + %19 = call @Unknown4(%arg16) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> + %20 = mhlo.convolution(%18, %19) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<1x64x56x56xf16> + %21:3 = call @BatchNormTrainingOp5(%20, %arg14, %arg13) : (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> (tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) + %22 = call @Unknown9(%21#0, %14) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> + %23 = call @Unknown16(%arg23) : (tensor<128x64x1x1xf32>) -> tensor<128x64x1x1xf16> + %24 = mhlo.convolution(%22, %23) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<128x64x1x1xf16>) -> tensor<1x128x28x28xf16> + %25:3 = call @BatchNormTrainingOp17(%24, %arg25, %arg24) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %26 = call @Unknown18(%arg21) : (tensor<128x64x3x3xf32>) -> tensor<128x64x3x3xf16> + %27 = mhlo.convolution(%22, %26) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x64x56x56xf16>, tensor<128x64x3x3xf16>) -> tensor<1x128x28x28xf16> + %28:3 = call @BatchNormTrainingOp17(%27, %arg18, %arg17) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> 
(tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %29 = call @Unknown20(%28#0) : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> + %30 = call @Unknown21(%arg22) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> + %31 = mhlo.convolution(%29, %30) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> + %32:3 = call @BatchNormTrainingOp17(%31, %arg20, %arg19) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %33 = call @Unknown23(%32#0, %25#0) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> + %34 = call @Unknown21(%arg30) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> + %35 = mhlo.convolution(%33, %34) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> + %36:3 = call @BatchNormTrainingOp17(%35, %arg27, %arg26) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %37 = call @Unknown20(%36#0) : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> + %38 = call @Unknown21(%arg31) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> + %39 = mhlo.convolution(%37, %38) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<1x128x28x28xf16> + %40:3 = call @BatchNormTrainingOp17(%39, %arg29, %arg28) : (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> (tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %41 = call @Unknown23(%40#0, %33) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> + %42 = call @Unknown30(%arg38) : (tensor<256x128x1x1xf32>) -> tensor<256x128x1x1xf16> + %43 = mhlo.convolution(%41, %42) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<256x128x1x1xf16>) -> tensor<1x256x14x14xf16> + %44:3 = call @BatchNormTrainingOp31(%43, %arg40, %arg39) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %45 = call @Unknown32(%arg36) : (tensor<256x128x3x3xf32>) -> tensor<256x128x3x3xf16> + %46 = mhlo.convolution(%41, %45) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x128x28x28xf16>, tensor<256x128x3x3xf16>) -> tensor<1x256x14x14xf16> + %47:3 = call @BatchNormTrainingOp31(%46, %arg33, %arg32) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) 
-> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %48 = call @Unknown34(%47#0) : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> + %49 = call @Unknown35(%arg37) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> + %50 = mhlo.convolution(%48, %49) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> + %51:3 = call @BatchNormTrainingOp31(%50, %arg35, %arg34) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %52 = call @Unknown37(%51#0, %44#0) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> + %53 = call @Unknown35(%arg45) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> + %54 = mhlo.convolution(%52, %53) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> + %55:3 = call @BatchNormTrainingOp31(%54, %arg42, %arg41) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %56 = call @Unknown34(%55#0) : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> + %57 = call @Unknown35(%arg46) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> + %58 = mhlo.convolution(%56, %57) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<1x256x14x14xf16> + %59:3 = call @BatchNormTrainingOp31(%58, %arg44, %arg43) : (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> (tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %60 = call @Unknown37(%59#0, %52) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> + %61 = call @Unknown44(%arg53) : (tensor<512x256x1x1xf32>) -> tensor<512x256x1x1xf16> + %62 = mhlo.convolution(%60, %61) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<512x256x1x1xf16>) -> tensor<1x512x7x7xf16> + %63:3 = call @BatchNormTrainingOp45(%62, %arg55, %arg54) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %64 = call @Unknown46(%arg51) : (tensor<512x256x3x3xf32>) -> tensor<512x256x3x3xf16> + %65 = mhlo.convolution(%60, %64) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x256x14x14xf16>, tensor<512x256x3x3xf16>) -> tensor<1x512x7x7xf16> + %66:3 = call @BatchNormTrainingOp45(%65, %arg48, %arg47) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> 
(tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %67 = call @Unknown48(%66#0) : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> + %68 = call @Unknown49(%arg52) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> + %69 = mhlo.convolution(%67, %68) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> + %70:3 = call @BatchNormTrainingOp45(%69, %arg50, %arg49) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %71 = call @Unknown51(%70#0, %63#0) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> + %72 = call @Unknown49(%arg60) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> + %73 = mhlo.convolution(%71, %72) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> + %74:3 = call @BatchNormTrainingOp45(%73, %arg57, %arg56) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %75 = call @Unknown48(%74#0) : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> + %76 = call @Unknown49(%arg61) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> + %77 = mhlo.convolution(%75, %76) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<1x512x7x7xf16> + %78:3 = call @BatchNormTrainingOp45(%77, %arg59, %arg58) : (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> (tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %79 = call @Unknown51(%78#0, %71) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> + %80 = call @Unknown58(%79) : (tensor<1x512x7x7xf16>) -> tensor<1x512xf16> + %81 = call @Unknown59(%80) : (tensor<1x512xf16>) -> tensor<1x512xf16> + %82 = call @Unknown60(%arg4) : (tensor<1000x512xf32>) -> tensor<1000x512xf16> + %83 = "mhlo.transpose"(%82) {minor_to_major = dense<[0, 1]> : tensor<2xindex>, permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<1000x512xf16>) -> tensor<512x1000xf16> + %84 = "mhlo.dot_general"(%81, %82) {dot_dimension_numbers = #mhlo.dot, precision_config = [#mhlo, #mhlo]} : (tensor<1x512xf16>, tensor<1000x512xf16>) -> tensor<1x1000xf16> + %85 = call @Unknown61(%arg3, %84) : (tensor<1000xf32>, tensor<1x1000xf16>) -> tensor<1x1000xf16> + %86 = call @Unknown62(%4#1, %arg63) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %87 = call @Unknown62(%4#2, %arg64) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %88 = call @Unknown62(%9#1, %arg66) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %89 = call @Unknown62(%9#2, %arg67) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %90 = call @Unknown62(%13#1, %arg69) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %91 = call @Unknown62(%13#2, %arg70) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %92 = call 
@Unknown62(%17#1, %arg72) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %93 = call @Unknown62(%17#2, %arg73) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %94 = call @Unknown62(%21#1, %arg75) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %95 = call @Unknown62(%21#2, %arg76) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %96 = call @Unknown72(%28#1, %arg78) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %97 = call @Unknown72(%28#2, %arg79) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %98 = call @Unknown72(%32#1, %arg81) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %99 = call @Unknown72(%32#2, %arg82) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %100 = call @Unknown72(%25#1, %arg84) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %101 = call @Unknown72(%25#2, %arg85) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %102 = call @Unknown72(%36#1, %arg87) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %103 = call @Unknown72(%36#2, %arg88) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %104 = call @Unknown72(%40#1, %arg90) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %105 = call @Unknown72(%40#2, %arg91) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %106 = call @Unknown82(%47#1, %arg93) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %107 = call @Unknown82(%47#2, %arg94) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %108 = call @Unknown82(%51#1, %arg96) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %109 = call @Unknown82(%51#2, %arg97) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %110 = call @Unknown82(%44#1, %arg99) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %111 = call @Unknown82(%44#2, %arg100) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %112 = call @Unknown82(%55#1, %arg102) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %113 = call @Unknown82(%55#2, %arg103) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %114 = call @Unknown82(%59#1, %arg105) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %115 = call @Unknown82(%59#2, %arg106) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %116 = call @Unknown92(%66#1, %arg108) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %117 = call @Unknown92(%66#2, %arg109) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %118 = call @Unknown92(%70#1, %arg111) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %119 = call @Unknown92(%70#2, %arg112) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %120 = call @Unknown92(%63#1, %arg114) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %121 = call @Unknown92(%63#2, %arg115) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %122 = call @Unknown92(%74#1, %arg117) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %123 = call @Unknown92(%74#2, %arg118) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %124 = call @Unknown92(%78#1, %arg120) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %125 = call @Unknown92(%78#2, %arg121) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + return %85, %arg0, %arg1, %arg5, %arg6, %arg7, %arg8, %arg11, %arg12, %arg13, %arg14, %arg17, %arg18, %arg19, %arg20, %arg24, %arg25, %arg26, %arg27, %arg28, %arg29, %arg32, %arg33, %arg34, %arg35, %arg39, %arg40, %arg41, %arg42, %arg43, %arg44, %arg47, %arg48, %arg49, %arg50, %arg54, %arg55, %arg56, 
%arg57, %arg58, %arg59, %86, %87, %88, %89, %90, %91, %92, %93, %94, %95, %96, %97, %98, %99, %100, %101, %102, %103, %104, %105, %106, %107, %108, %109, %110, %111, %112, %113, %114, %115, %116, %117, %118, %119, %120, %121, %122, %123, %124, %125, %2, %1, %3, %5, %6, %7, %8, %10, %11, %12, %14, %15, %16, %18, %19, %20, %22, %26, %27, %29, %30, %31, %23, %24, %33, %34, %35, %37, %38, %39, %41, %45, %46, %48, %49, %50, %42, %43, %52, %53, %54, %56, %57, %58, %60, %64, %65, %67, %68, %69, %61, %62, %71, %72, %73, %75, %76, %77, %79, %81, %83 : tensor<1x1000xf16>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<64x3x7x7xf16>, tensor<1x3x224x224xf16>, tensor<1x64x112x112xf16>, tensor<1x64x112x112xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<128x64x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<128x64x1x1xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<256x128x3x3xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, tensor<1x256x14x14xf16>, tensor<256x128x1x1xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<512x256x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<512x256x1x1xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<1x512xf16>, tensor<512x1000xf16> } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/FW/4_bufferize_opt.mlir b/compiler/test/E2E/ResNet18/FW/4_bufferize_opt.mlir index 
1e0da346e..7f8aff083 100644 --- a/compiler/test/E2E/ResNet18/FW/4_bufferize_opt.mlir +++ b/compiler/test/E2E/ResNet18/FW/4_bufferize_opt.mlir @@ -2,925 +2,841 @@ // CHECK-LABEL: func.func @main -#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> -#map1 = affine_map<(d0, d1) -> (d0, d1)> -#map2 = affine_map<(d0) -> (d0)> +#map = affine_map<() -> ()> +#map1 = affine_map<(d0) -> (d0 mod 64, 49)> +#map2 = affine_map<(d0) -> (d0 mod 64 + 1, 49)> +#map3 = affine_map<(d0, d1) -> (d0 - d1)> +#map4 = affine_map<(d0) -> (d0 * 2)> +#map5 = affine_map<(d0) -> (d0 * 2 + 1)> module { func.func private @Unknown0(%arg0: tensor<1x3x224x224xf32>) -> tensor<1x3x224x224xf16> attributes {__byteir_elementwise_fusion__} { + %c224 = arith.constant 224 : index + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<1x3x224x224xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<1x3x224x224xf32>) outs(%0 : tensor<1x3x224x224xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<1x3x224x224xf16> + %1 = scf.for %arg1 = %c0 to %c3 step %c1 iter_args(%arg2 = %0) -> (tensor<1x3x224x224xf16>) { + %2 = scf.for %arg3 = %c0 to %c224 step %c1 iter_args(%arg4 = %arg2) -> (tensor<1x3x224x224xf16>) { + %3 = scf.for %arg5 = %c0 to %c224 step %c1 iter_args(%arg6 = %arg4) -> (tensor<1x3x224x224xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg1, %arg3, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x3x224x224xf32> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%4 : tensor) { + ^bb0(%in: f32, %out: f16): + %6 = arith.truncf %in : f32 to f16 + linalg.yield %6 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg6[0, %arg1, %arg3, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x3x224x224xf16> + scf.yield %inserted_slice : tensor<1x3x224x224xf16> + } + scf.yield %3 : tensor<1x3x224x224xf16> + } + scf.yield %2 : tensor<1x3x224x224xf16> + } return %1 : tensor<1x3x224x224xf16> } func.func private @Unknown1(%arg0: tensor<64x3x7x7xf32>) -> tensor<64x3x7x7xf16> attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<64x3x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x3x7x7xf32>) outs(%0 : tensor<64x3x7x7xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<64x3x7x7xf16> + %1 = scf.for %arg1 = %c0 to %c64 step %c1 iter_args(%arg2 = %0) -> (tensor<64x3x7x7xf16>) { + %2 = scf.for %arg3 = %c0 to %c3 step %c1 iter_args(%arg4 = %arg2) -> (tensor<64x3x7x7xf16>) { + %3 = scf.for %arg5 = %c0 to %c7 step %c1 iter_args(%arg6 = %arg4) -> (tensor<64x3x7x7xf16>) { + %4 = scf.for %arg7 = %c0 to %c7 step %c1 iter_args(%arg8 = %arg6) -> (tensor<64x3x7x7xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<64x3x7x7xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + 
^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<64x3x7x7xf16> + scf.yield %inserted_slice : tensor<64x3x7x7xf16> + } + scf.yield %4 : tensor<64x3x7x7xf16> + } + scf.yield %3 : tensor<64x3x7x7xf16> + } + scf.yield %2 : tensor<64x3x7x7xf16> + } return %1 : tensor<64x3x7x7xf16> } func.func private @Unknown3(%arg0: tensor<1x64x112x112xf16>) -> tensor<1x64x112x112xf16> attributes {__byteir_elementwise_fusion__} { + %c112 = arith.constant 112 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x64x112x112xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<1x64x112x112xf16>) outs(%0 : tensor<1x64x112x112xf16>) { - ^bb0(%in: f16, %out: f16): - %2 = arith.maxnumf %in, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<1x64x112x112xf16> + %1 = scf.for %arg1 = %c0 to %c64 step %c1 iter_args(%arg2 = %0) -> (tensor<1x64x112x112xf16>) { + %2 = scf.for %arg3 = %c0 to %c112 step %c1 iter_args(%arg4 = %arg2) -> (tensor<1x64x112x112xf16>) { + %3 = scf.for %arg5 = %c0 to %c112 step %c1 iter_args(%arg6 = %arg4) -> (tensor<1x64x112x112xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg1, %arg3, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x112x112xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %out: f16): + %6 = arith.maximumf %in, %cst : f16 + linalg.yield %6 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg6[0, %arg1, %arg3, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x64x112x112xf16> + scf.yield %inserted_slice : tensor<1x64x112x112xf16> + } + scf.yield %3 : tensor<1x64x112x112xf16> + } + scf.yield %2 : tensor<1x64x112x112xf16> + } return %1 : tensor<1x64x112x112xf16> } func.func private @Unknown4(%arg0: tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<64x64x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf32>) outs(%0 : tensor<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<64x64x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c64 step %c1 iter_args(%arg2 = %0) -> (tensor<64x64x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %arg2) -> (tensor<64x64x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<64x64x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<64x64x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<64x64x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 
+ } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<64x64x3x3xf16> + scf.yield %inserted_slice : tensor<64x64x3x3xf16> + } + scf.yield %4 : tensor<64x64x3x3xf16> + } + scf.yield %3 : tensor<64x64x3x3xf16> + } + scf.yield %2 : tensor<64x64x3x3xf16> + } return %1 : tensor<64x64x3x3xf16> } func.func private @Unknown6(%arg0: tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + %c56 = arith.constant 56 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<1x64x56x56xf16>) outs(%0 : tensor<1x64x56x56xf16>) { - ^bb0(%in: f16, %out: f16): - %2 = arith.maxnumf %in, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<1x64x56x56xf16> + %1 = scf.for %arg1 = %c0 to %c64 step %c1 iter_args(%arg2 = %0) -> (tensor<1x64x56x56xf16>) { + %2 = scf.for %arg3 = %c0 to %c56 step %c1 iter_args(%arg4 = %arg2) -> (tensor<1x64x56x56xf16>) { + %3 = scf.for %arg5 = %c0 to %c56 step %c1 iter_args(%arg6 = %arg4) -> (tensor<1x64x56x56xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg1, %arg3, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x56x56xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %out: f16): + %6 = arith.maximumf %in, %cst : f16 + linalg.yield %6 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg6[0, %arg1, %arg3, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x64x56x56xf16> + scf.yield %inserted_slice : tensor<1x64x56x56xf16> + } + scf.yield %3 : tensor<1x64x56x56xf16> + } + scf.yield %2 : tensor<1x64x56x56xf16> + } return %1 : tensor<1x64x56x56xf16> } - func.func private @Unknown7(%arg0: tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<64x64x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf32>) outs(%0 : tensor<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<64x64x3x3xf16> - return %1 : tensor<64x64x3x3xf16> - } func.func private @Unknown9(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + %c56 = arith.constant 56 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) outs(%0 : tensor<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.addf %in, %in_0 : f16 - %3 = arith.maxnumf %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x64x56x56xf16> - return %1 : tensor<1x64x56x56xf16> - } - func.func private @Unknown10(%arg0: tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : 
tensor<64x64x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf32>) outs(%0 : tensor<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<64x64x3x3xf16> - return %1 : tensor<64x64x3x3xf16> - } - func.func private @Unknown12(%arg0: tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<1x64x56x56xf16>) outs(%0 : tensor<1x64x56x56xf16>) { - ^bb0(%in: f16, %out: f16): - %2 = arith.maxnumf %in, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<1x64x56x56xf16> - return %1 : tensor<1x64x56x56xf16> - } - func.func private @Unknown13(%arg0: tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<64x64x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf32>) outs(%0 : tensor<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<64x64x3x3xf16> - return %1 : tensor<64x64x3x3xf16> - } - func.func private @Unknown15(%arg0: tensor<1x64x56x56xf16>, %arg1: tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) outs(%0 : tensor<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.addf %in, %in_0 : f16 - %3 = arith.maxnumf %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x64x56x56xf16> + %1 = scf.for %arg2 = %c0 to %c64 step %c1 iter_args(%arg3 = %0) -> (tensor<1x64x56x56xf16>) { + %2 = scf.for %arg4 = %c0 to %c56 step %c1 iter_args(%arg5 = %arg3) -> (tensor<1x64x56x56xf16>) { + %3 = scf.for %arg6 = %c0 to %c56 step %c1 iter_args(%arg7 = %arg5) -> (tensor<1x64x56x56xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x56x56xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x64x56x56xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + %6 = arith.addf %in, %in_1 : f16 + %7 = arith.maximumf %6, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg7[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x64x56x56xf16> + scf.yield %inserted_slice : tensor<1x64x56x56xf16> + } + scf.yield %3 : tensor<1x64x56x56xf16> + } + scf.yield %2 : tensor<1x64x56x56xf16> + } return %1 : tensor<1x64x56x56xf16> } func.func private @Unknown16(%arg0: tensor<128x64x1x1xf32>) -> tensor<128x64x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c128 
= arith.constant 128 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<128x64x1x1xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x64x1x1xf32>) outs(%0 : tensor<128x64x1x1xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<128x64x1x1xf16> + %1 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<128x64x1x1xf16>) { + %2 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %arg2) -> (tensor<128x64x1x1xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<128x64x1x1xf32> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f32, %out: f16): + %5 = arith.truncf %in : f32 to f16 + linalg.yield %5 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<128x64x1x1xf16> + scf.yield %inserted_slice : tensor<128x64x1x1xf16> + } + scf.yield %2 : tensor<128x64x1x1xf16> + } return %1 : tensor<128x64x1x1xf16> } func.func private @Unknown18(%arg0: tensor<128x64x3x3xf32>) -> tensor<128x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<128x64x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x64x3x3xf32>) outs(%0 : tensor<128x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<128x64x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<128x64x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %arg2) -> (tensor<128x64x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<128x64x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<128x64x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<128x64x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<128x64x3x3xf16> + scf.yield %inserted_slice : tensor<128x64x3x3xf16> + } + scf.yield %4 : tensor<128x64x3x3xf16> + } + scf.yield %3 : tensor<128x64x3x3xf16> + } + scf.yield %2 : tensor<128x64x3x3xf16> + } return %1 : tensor<128x64x3x3xf16> } func.func private @Unknown20(%arg0: tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { + %c28 = arith.constant 28 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x128x28x28xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", 
"parallel", "parallel"]} ins(%arg0 : tensor<1x128x28x28xf16>) outs(%0 : tensor<1x128x28x28xf16>) { - ^bb0(%in: f16, %out: f16): - %2 = arith.maxnumf %in, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<1x128x28x28xf16> + %1 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<1x128x28x28xf16>) { + %2 = scf.for %arg3 = %c0 to %c28 step %c1 iter_args(%arg4 = %arg2) -> (tensor<1x128x28x28xf16>) { + %3 = scf.for %arg5 = %c0 to %c28 step %c1 iter_args(%arg6 = %arg4) -> (tensor<1x128x28x28xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg1, %arg3, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x128x28x28xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %out: f16): + %6 = arith.maximumf %in, %cst : f16 + linalg.yield %6 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg6[0, %arg1, %arg3, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x128x28x28xf16> + scf.yield %inserted_slice : tensor<1x128x28x28xf16> + } + scf.yield %3 : tensor<1x128x28x28xf16> + } + scf.yield %2 : tensor<1x128x28x28xf16> + } return %1 : tensor<1x128x28x28xf16> } func.func private @Unknown21(%arg0: tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<128x128x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x128x3x3xf32>) outs(%0 : tensor<128x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<128x128x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<128x128x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %arg2) -> (tensor<128x128x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<128x128x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<128x128x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<128x128x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<128x128x3x3xf16> + scf.yield %inserted_slice : tensor<128x128x3x3xf16> + } + scf.yield %4 : tensor<128x128x3x3xf16> + } + scf.yield %3 : tensor<128x128x3x3xf16> + } + scf.yield %2 : tensor<128x128x3x3xf16> + } return %1 : tensor<128x128x3x3xf16> } func.func private @Unknown23(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { + %c28 = arith.constant 28 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x128x28x28xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", 
"parallel"]} ins(%arg0, %arg1 : tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) outs(%0 : tensor<1x128x28x28xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.addf %in, %in_0 : f16 - %3 = arith.maxnumf %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x128x28x28xf16> - return %1 : tensor<1x128x28x28xf16> - } - func.func private @Unknown24(%arg0: tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<128x128x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x128x3x3xf32>) outs(%0 : tensor<128x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<128x128x3x3xf16> - return %1 : tensor<128x128x3x3xf16> - } - func.func private @Unknown26(%arg0: tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x128x28x28xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<1x128x28x28xf16>) outs(%0 : tensor<1x128x28x28xf16>) { - ^bb0(%in: f16, %out: f16): - %2 = arith.maxnumf %in, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<1x128x28x28xf16> - return %1 : tensor<1x128x28x28xf16> - } - func.func private @Unknown27(%arg0: tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<128x128x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x128x3x3xf32>) outs(%0 : tensor<128x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<128x128x3x3xf16> - return %1 : tensor<128x128x3x3xf16> - } - func.func private @Unknown29(%arg0: tensor<1x128x28x28xf16>, %arg1: tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x128x28x28xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) outs(%0 : tensor<1x128x28x28xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.addf %in, %in_0 : f16 - %3 = arith.maxnumf %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x128x28x28xf16> + %1 = scf.for %arg2 = %c0 to %c128 step %c1 iter_args(%arg3 = %0) -> (tensor<1x128x28x28xf16>) { + %2 = scf.for %arg4 = %c0 to %c28 step %c1 iter_args(%arg5 = %arg3) -> (tensor<1x128x28x28xf16>) { + %3 = scf.for %arg6 = %c0 to %c28 step %c1 iter_args(%arg7 = %arg5) -> (tensor<1x128x28x28xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x128x28x28xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x128x28x28xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + %6 = arith.addf %in, %in_1 : f16 + %7 = arith.maximumf %6, %cst : f16 + linalg.yield %7 : f16 + } -> 
tensor + %inserted_slice = tensor.insert_slice %5 into %arg7[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x128x28x28xf16> + scf.yield %inserted_slice : tensor<1x128x28x28xf16> + } + scf.yield %3 : tensor<1x128x28x28xf16> + } + scf.yield %2 : tensor<1x128x28x28xf16> + } return %1 : tensor<1x128x28x28xf16> } func.func private @Unknown30(%arg0: tensor<256x128x1x1xf32>) -> tensor<256x128x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<256x128x1x1xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x128x1x1xf32>) outs(%0 : tensor<256x128x1x1xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<256x128x1x1xf16> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 iter_args(%arg2 = %0) -> (tensor<256x128x1x1xf16>) { + %2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %arg2) -> (tensor<256x128x1x1xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<256x128x1x1xf32> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f32, %out: f16): + %5 = arith.truncf %in : f32 to f16 + linalg.yield %5 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<256x128x1x1xf16> + scf.yield %inserted_slice : tensor<256x128x1x1xf16> + } + scf.yield %2 : tensor<256x128x1x1xf16> + } return %1 : tensor<256x128x1x1xf16> } func.func private @Unknown32(%arg0: tensor<256x128x3x3xf32>) -> tensor<256x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<256x128x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x128x3x3xf32>) outs(%0 : tensor<256x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<256x128x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 iter_args(%arg2 = %0) -> (tensor<256x128x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %arg2) -> (tensor<256x128x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<256x128x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<256x128x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<256x128x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<256x128x3x3xf16> + scf.yield %inserted_slice : tensor<256x128x3x3xf16> + } + scf.yield %4 : tensor<256x128x3x3xf16> + } + 
scf.yield %3 : tensor<256x128x3x3xf16> + } + scf.yield %2 : tensor<256x128x3x3xf16> + } return %1 : tensor<256x128x3x3xf16> } func.func private @Unknown34(%arg0: tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { + %c14 = arith.constant 14 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x256x14x14xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<1x256x14x14xf16>) outs(%0 : tensor<1x256x14x14xf16>) { - ^bb0(%in: f16, %out: f16): - %2 = arith.maxnumf %in, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<1x256x14x14xf16> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 iter_args(%arg2 = %0) -> (tensor<1x256x14x14xf16>) { + %2 = scf.for %arg3 = %c0 to %c14 step %c1 iter_args(%arg4 = %arg2) -> (tensor<1x256x14x14xf16>) { + %3 = scf.for %arg5 = %c0 to %c14 step %c1 iter_args(%arg6 = %arg4) -> (tensor<1x256x14x14xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg1, %arg3, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x256x14x14xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %out: f16): + %6 = arith.maximumf %in, %cst : f16 + linalg.yield %6 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg6[0, %arg1, %arg3, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x256x14x14xf16> + scf.yield %inserted_slice : tensor<1x256x14x14xf16> + } + scf.yield %3 : tensor<1x256x14x14xf16> + } + scf.yield %2 : tensor<1x256x14x14xf16> + } return %1 : tensor<1x256x14x14xf16> } func.func private @Unknown35(%arg0: tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<256x256x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x256x3x3xf32>) outs(%0 : tensor<256x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<256x256x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 iter_args(%arg2 = %0) -> (tensor<256x256x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %arg2) -> (tensor<256x256x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<256x256x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<256x256x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<256x256x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<256x256x3x3xf16> + scf.yield %inserted_slice : tensor<256x256x3x3xf16> + } + scf.yield %4 : tensor<256x256x3x3xf16> + } + scf.yield %3 : tensor<256x256x3x3xf16> + } + scf.yield %2 
: tensor<256x256x3x3xf16> + } return %1 : tensor<256x256x3x3xf16> } func.func private @Unknown37(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { + %c14 = arith.constant 14 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x256x14x14xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) outs(%0 : tensor<1x256x14x14xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.addf %in, %in_0 : f16 - %3 = arith.maxnumf %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x256x14x14xf16> - return %1 : tensor<1x256x14x14xf16> - } - func.func private @Unknown38(%arg0: tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<256x256x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x256x3x3xf32>) outs(%0 : tensor<256x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<256x256x3x3xf16> - return %1 : tensor<256x256x3x3xf16> - } - func.func private @Unknown40(%arg0: tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x256x14x14xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<1x256x14x14xf16>) outs(%0 : tensor<1x256x14x14xf16>) { - ^bb0(%in: f16, %out: f16): - %2 = arith.maxnumf %in, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<1x256x14x14xf16> - return %1 : tensor<1x256x14x14xf16> - } - func.func private @Unknown41(%arg0: tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<256x256x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x256x3x3xf32>) outs(%0 : tensor<256x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<256x256x3x3xf16> - return %1 : tensor<256x256x3x3xf16> - } - func.func private @Unknown43(%arg0: tensor<1x256x14x14xf16>, %arg1: tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x256x14x14xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) outs(%0 : tensor<1x256x14x14xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.addf %in, %in_0 : f16 - %3 = arith.maxnumf %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x256x14x14xf16> + %1 = scf.for %arg2 = %c0 to %c256 step %c1 iter_args(%arg3 = %0) -> (tensor<1x256x14x14xf16>) { + %2 = scf.for %arg4 = %c0 to %c14 step %c1 iter_args(%arg5 = %arg3) -> (tensor<1x256x14x14xf16>) { + %3 = scf.for %arg6 = %c0 to %c14 step %c1 iter_args(%arg7 = %arg5) -> (tensor<1x256x14x14xf16>) { + %extracted_slice = 
tensor.extract_slice %arg0[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x256x14x14xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x256x14x14xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + %6 = arith.addf %in, %in_1 : f16 + %7 = arith.maximumf %6, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg7[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x256x14x14xf16> + scf.yield %inserted_slice : tensor<1x256x14x14xf16> + } + scf.yield %3 : tensor<1x256x14x14xf16> + } + scf.yield %2 : tensor<1x256x14x14xf16> + } return %1 : tensor<1x256x14x14xf16> } func.func private @Unknown44(%arg0: tensor<512x256x1x1xf32>) -> tensor<512x256x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<512x256x1x1xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x256x1x1xf32>) outs(%0 : tensor<512x256x1x1xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<512x256x1x1xf16> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<512x256x1x1xf16>) { + %2 = scf.for %arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %arg2) -> (tensor<512x256x1x1xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<512x256x1x1xf32> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f32, %out: f16): + %5 = arith.truncf %in : f32 to f16 + linalg.yield %5 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<512x256x1x1xf16> + scf.yield %inserted_slice : tensor<512x256x1x1xf16> + } + scf.yield %2 : tensor<512x256x1x1xf16> + } return %1 : tensor<512x256x1x1xf16> } func.func private @Unknown46(%arg0: tensor<512x256x3x3xf32>) -> tensor<512x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<512x256x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x256x3x3xf32>) outs(%0 : tensor<512x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<512x256x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<512x256x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %arg2) -> (tensor<512x256x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<512x256x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<512x256x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, 
%arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<512x256x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<512x256x3x3xf16> + scf.yield %inserted_slice : tensor<512x256x3x3xf16> + } + scf.yield %4 : tensor<512x256x3x3xf16> + } + scf.yield %3 : tensor<512x256x3x3xf16> + } + scf.yield %2 : tensor<512x256x3x3xf16> + } return %1 : tensor<512x256x3x3xf16> } func.func private @Unknown48(%arg0: tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x512x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<1x512x7x7xf16>) outs(%0 : tensor<1x512x7x7xf16>) { - ^bb0(%in: f16, %out: f16): - %2 = arith.maxnumf %in, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<1x512x7x7xf16> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<1x512x7x7xf16>) { + %2 = scf.for %arg3 = %c0 to %c7 step %c1 iter_args(%arg4 = %arg2) -> (tensor<1x512x7x7xf16>) { + %3 = scf.for %arg5 = %c0 to %c7 step %c1 iter_args(%arg6 = %arg4) -> (tensor<1x512x7x7xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg1, %arg3, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x512x7x7xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %out: f16): + %6 = arith.maximumf %in, %cst : f16 + linalg.yield %6 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg6[0, %arg1, %arg3, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x512x7x7xf16> + scf.yield %inserted_slice : tensor<1x512x7x7xf16> + } + scf.yield %3 : tensor<1x512x7x7xf16> + } + scf.yield %2 : tensor<1x512x7x7xf16> + } return %1 : tensor<1x512x7x7xf16> } func.func private @Unknown49(%arg0: tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<512x512x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x512x3x3xf32>) outs(%0 : tensor<512x512x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<512x512x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<512x512x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %arg2) -> (tensor<512x512x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<512x512x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<512x512x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<512x512x3x3xf32> to tensor + %5 = tensor.empty() : 
tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<512x512x3x3xf16> + scf.yield %inserted_slice : tensor<512x512x3x3xf16> + } + scf.yield %4 : tensor<512x512x3x3xf16> + } + scf.yield %3 : tensor<512x512x3x3xf16> + } + scf.yield %2 : tensor<512x512x3x3xf16> + } return %1 : tensor<512x512x3x3xf16> } func.func private @Unknown51(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<1x512x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) outs(%0 : tensor<1x512x7x7xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.addf %in, %in_0 : f16 - %3 = arith.maxnumf %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x512x7x7xf16> + %1 = scf.for %arg2 = %c0 to %c512 step %c1 iter_args(%arg3 = %0) -> (tensor<1x512x7x7xf16>) { + %2 = scf.for %arg4 = %c0 to %c7 step %c1 iter_args(%arg5 = %arg3) -> (tensor<1x512x7x7xf16>) { + %3 = scf.for %arg6 = %c0 to %c7 step %c1 iter_args(%arg7 = %arg5) -> (tensor<1x512x7x7xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x512x7x7xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<1x512x7x7xf16> to tensor + %4 = tensor.empty() : tensor + %5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%4 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + %6 = arith.addf %in, %in_1 : f16 + %7 = arith.maximumf %6, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %5 into %arg7[0, %arg2, %arg4, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<1x512x7x7xf16> + scf.yield %inserted_slice : tensor<1x512x7x7xf16> + } + scf.yield %3 : tensor<1x512x7x7xf16> + } + scf.yield %2 : tensor<1x512x7x7xf16> + } return %1 : tensor<1x512x7x7xf16> } - func.func private @Unknown52(%arg0: tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<512x512x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x512x3x3xf32>) outs(%0 : tensor<512x512x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<512x512x3x3xf16> - return %1 : tensor<512x512x3x3xf16> - } - func.func private @Unknown54(%arg0: tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown58(%arg0: tensor<1x512x7x7xf16>) -> tensor<1x512xf16> attributes {__byteir_reduction_fusion__} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x512x7x7xf16> - %1 = 
linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<1x512x7x7xf16>) outs(%0 : tensor<1x512x7x7xf16>) { - ^bb0(%in: f16, %out: f16): - %2 = arith.maxnumf %in, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<1x512x7x7xf16> - return %1 : tensor<1x512x7x7xf16> - } - func.func private @Unknown55(%arg0: tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<512x512x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x512x3x3xf32>) outs(%0 : tensor<512x512x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<512x512x3x3xf16> - return %1 : tensor<512x512x3x3xf16> - } - func.func private @Unknown57(%arg0: tensor<1x512x7x7xf16>, %arg1: tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<1x512x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) outs(%0 : tensor<1x512x7x7xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.addf %in, %in_0 : f16 - %3 = arith.maxnumf %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<1x512x7x7xf16> - return %1 : tensor<1x512x7x7xf16> - } - func.func private @Unknown58(%arg0: tensor<1x512xf16>) -> tensor<1x512xf16> attributes {__byteir_elementwise_fusion__} { + %collapsed = tensor.collapse_shape %arg0 [[0, 1], [2, 3]] : tensor<1x512x7x7xf16> into tensor<512x49xf16> + %0 = tensor.empty() : tensor<512xf16> + %1 = scf.forall (%arg1) in (512) shared_outs(%arg2 = %0) -> (tensor<512xf16>) { + %extracted_slice = tensor.extract_slice %collapsed[%arg1, 0] [1, 49] [1, 1] : tensor<512x49xf16> to tensor<49xf16> + %expanded_0 = tensor.expand_shape %extracted_slice [[0, 1]] : tensor<49xf16> into tensor<1x49xf16> + %extracted_slice_1 = tensor.extract_slice %arg2[%arg1] [1] [1] : tensor<512xf16> to tensor + %2 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<64xf16> + %3 = scf.forall (%arg3) in (64) shared_outs(%arg4 = %2) -> (tensor<64xf16>) { + %15 = affine.min #map1(%arg3) + %16 = affine.min #map2(%arg3) + %17 = affine.apply #map3(%16, %15) + %extracted_slice_7 = tensor.extract_slice %expanded_0[0, %15] [1, %17] [1, 1] : tensor<1x49xf16> to tensor + %expanded_8 = tensor.expand_shape %extracted_slice_7 [[0, 1]] : tensor into tensor<1x?xf16> + %dim = tensor.dim %extracted_slice_7, %c0 : tensor + %18 = arith.cmpi ugt, %dim, %c0 : index + %19 = scf.if %18 -> (f16) { + %extracted = tensor.extract %expanded_8[%c0, %c0] : tensor<1x?xf16> + scf.yield %extracted : f16 + } else { + scf.yield %cst : f16 + } + %20 = arith.addf %19, %cst : f16 + %extracted_slice_9 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<64xf16> to tensor + %inserted = tensor.insert %20 into %extracted_slice_9[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<64xf16> + } + } {mapping = [#gpu.thread]} + %expanded_2 = tensor.expand_shape %3 [[0, 1]] : tensor<64xf16> into tensor<32x2xf16> + %4 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<32xf16> + %5 = scf.forall (%arg3) in (32) shared_outs(%arg4 = %4) -> 
(tensor<32xf16>) { + %extracted = tensor.extract %expanded_2[%arg3, %c0] : tensor<32x2xf16> + %15 = arith.addf %extracted, %cst : f16 + %extracted_7 = tensor.extract %expanded_2[%arg3, %c1] : tensor<32x2xf16> + %16 = arith.addf %extracted_7, %15 : f16 + %extracted_slice_8 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<32xf16> to tensor + %inserted = tensor.insert %16 into %extracted_slice_8[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<32xf16> + } + } {mapping = [#gpu.thread]} + %expanded_3 = tensor.expand_shape %5 [[0, 1]] : tensor<32xf16> into tensor<16x2xf16> + %6 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<16xf16> + %7 = scf.forall (%arg3) in (16) shared_outs(%arg4 = %6) -> (tensor<16xf16>) { + %extracted = tensor.extract %expanded_3[%arg3, %c0] : tensor<16x2xf16> + %15 = arith.addf %extracted, %cst : f16 + %extracted_7 = tensor.extract %expanded_3[%arg3, %c1] : tensor<16x2xf16> + %16 = arith.addf %extracted_7, %15 : f16 + %extracted_slice_8 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<16xf16> to tensor + %inserted = tensor.insert %16 into %extracted_slice_8[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<16xf16> + } + } {mapping = [#gpu.thread]} + %expanded_4 = tensor.expand_shape %7 [[0, 1]] : tensor<16xf16> into tensor<8x2xf16> + %8 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<8xf16> + %9 = scf.forall (%arg3) in (8) shared_outs(%arg4 = %8) -> (tensor<8xf16>) { + %extracted = tensor.extract %expanded_4[%arg3, %c0] : tensor<8x2xf16> + %15 = arith.addf %extracted, %cst : f16 + %extracted_7 = tensor.extract %expanded_4[%arg3, %c1] : tensor<8x2xf16> + %16 = arith.addf %extracted_7, %15 : f16 + %extracted_slice_8 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<8xf16> to tensor + %inserted = tensor.insert %16 into %extracted_slice_8[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<8xf16> + } + } {mapping = [#gpu.thread]} + %expanded_5 = tensor.expand_shape %9 [[0, 1]] : tensor<8xf16> into tensor<4x2xf16> + %10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<4xf16> + %11 = scf.forall (%arg3) in (4) shared_outs(%arg4 = %10) -> (tensor<4xf16>) { + %extracted = tensor.extract %expanded_5[%arg3, %c0] : tensor<4x2xf16> + %15 = arith.addf %extracted, %cst : f16 + %extracted_7 = tensor.extract %expanded_5[%arg3, %c1] : tensor<4x2xf16> + %16 = arith.addf %extracted_7, %15 : f16 + %extracted_slice_8 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<4xf16> to tensor + %inserted = tensor.insert %16 into %extracted_slice_8[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<4xf16> + } + } {mapping = [#gpu.thread]} + %expanded_6 = tensor.expand_shape %11 [[0, 1]] : tensor<4xf16> into tensor<2x2xf16> + %12 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<2xf16> + %13 = scf.forall (%arg3) in (2) shared_outs(%arg4 = %12) -> (tensor<2xf16>) { + %extracted = tensor.extract %expanded_6[%arg3, %c0] : tensor<2x2xf16> + %15 = arith.addf %extracted, %cst : f16 + %extracted_7 = tensor.extract %expanded_6[%arg3, %c1] : tensor<2x2xf16> + %16 = arith.addf %extracted_7, %15 : f16 + %extracted_slice_8 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<2xf16> to tensor + 
%inserted = tensor.insert %16 into %extracted_slice_8[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<2xf16> + } + } {mapping = [#gpu.thread]} + %14 = scf.forall (%arg3) in (1) shared_outs(%arg4 = %extracted_slice_1) -> (tensor) { + %15 = affine.apply #map4(%arg3) + %extracted = tensor.extract %13[%15] : tensor<2xf16> + %16 = arith.addf %extracted, %cst : f16 + %17 = affine.apply #map5(%arg3) + %extracted_7 = tensor.extract %13[%17] : tensor<2xf16> + %18 = arith.addf %extracted_7, %16 : f16 + %inserted = tensor.insert %18 into %arg4[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[] [] [] : tensor into tensor + } + } {mapping = [#gpu.thread]} + scf.forall.in_parallel { + tensor.parallel_insert_slice %14 into %arg2[%arg1] [1] [1] : tensor into tensor<512xf16> + } + } {mapping = [#gpu.block]} + %expanded = tensor.expand_shape %1 [[0, 1]] : tensor<512xf16> into tensor<1x512xf16> + return %expanded : tensor<1x512xf16> + } + func.func private @Unknown59(%arg0: tensor<1x512xf16>) -> tensor<1x512xf16> attributes {__byteir_elementwise_fusion__} { + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 2.040100e-02 : f16 %0 = tensor.empty() : tensor<1x512xf16> - %1 = linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1x512xf16>) outs(%0 : tensor<1x512xf16>) { - ^bb0(%in: f16, %out: f16): - %2 = arith.mulf %in, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<1x512xf16> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<1x512xf16>) { + %extracted_slice = tensor.extract_slice %arg0[0, %arg1] [1, 1] [1, 1] : tensor<1x512xf16> to tensor + %2 = tensor.empty() : tensor + %3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%2 : tensor) { + ^bb0(%in: f16, %out: f16): + %4 = arith.mulf %in, %cst : f16 + linalg.yield %4 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %3 into %arg2[0, %arg1] [1, 1] [1, 1] : tensor into tensor<1x512xf16> + scf.yield %inserted_slice : tensor<1x512xf16> + } return %1 : tensor<1x512xf16> } - func.func private @Unknown59(%arg0: tensor<1000x512xf32>) -> tensor<1000x512xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown60(%arg0: tensor<1000x512xf32>) -> tensor<1000x512xf16> attributes {__byteir_elementwise_fusion__} { + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<1000x512xf16> - %1 = linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1000x512xf32>) outs(%0 : tensor<1000x512xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<1000x512xf16> + %1 = scf.for %arg1 = %c0 to %c1000 step %c1 iter_args(%arg2 = %0) -> (tensor<1000x512xf16>) { + %2 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %arg2) -> (tensor<1000x512xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3] [1, 1] [1, 1] : tensor<1000x512xf32> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f32, %out: f16): + %5 = arith.truncf %in : f32 to f16 + linalg.yield %5 : f16 + } 
-> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3] [1, 1] [1, 1] : tensor into tensor<1000x512xf16> + scf.yield %inserted_slice : tensor<1000x512xf16> + } + scf.yield %2 : tensor<1000x512xf16> + } return %1 : tensor<1000x512xf16> } - func.func private @Unknown60(%arg0: tensor<1000xf32>, %arg1: tensor<1x1000xf16>) -> tensor<1x1000xf16> attributes {__byteir_elementwise_fusion__} { - %expanded = tensor.expand_shape %arg0 [[0, 1]] : tensor<1000xf32> into tensor<1x1000xf32> + func.func private @Unknown61(%arg0: tensor<1000xf32>, %arg1: tensor<1x1000xf16>) -> tensor<1x1000xf16> attributes {__byteir_elementwise_fusion__} { + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<1x1000xf16> - %1 = linalg.generic {indexing_maps = [#map1, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg1, %expanded : tensor<1x1000xf16>, tensor<1x1000xf32>) outs(%0 : tensor<1x1000xf16>) { - ^bb0(%in: f16, %in_0: f32, %out: f16): - %2 = arith.truncf %in_0 : f32 to f16 - %3 = arith.addf %in, %2 : f16 - linalg.yield %3 : f16 - } -> tensor<1x1000xf16> + %1 = scf.for %arg2 = %c0 to %c1000 step %c1 iter_args(%arg3 = %0) -> (tensor<1x1000xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg2] [1] [1] : tensor<1000xf32> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[0, %arg2] [1, 1] [1, 1] : tensor<1x1000xf16> to tensor + %2 = tensor.empty() : tensor + %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%2 : tensor) { + ^bb0(%in: f32, %in_1: f16, %out: f16): + %4 = arith.truncf %in : f32 to f16 + %5 = arith.addf %in_1, %4 : f16 + linalg.yield %5 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %3 into %arg3[0, %arg2] [1, 1] [1, 1] : tensor into tensor<1x1000xf16> + scf.yield %inserted_slice : tensor<1x1000xf16> + } return %1 : tensor<1x1000xf16> } - func.func private @Unknown61(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<64xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<64xf32>, tensor<64xf32>) outs(%0 : tensor<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<64xf32> - return %1 : tensor<64xf32> - } func.func private @Unknown62(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.899999976 : f32 %cst_0 = arith.constant 1.000000e-01 : f32 %0 = tensor.empty() : tensor<64xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<64xf32>, tensor<64xf32>) outs(%0 : tensor<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<64xf32> - return %1 : tensor<64xf32> - } - func.func private @Unknown63(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = 
arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<64xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<64xf32>, tensor<64xf32>) outs(%0 : tensor<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<64xf32> - return %1 : tensor<64xf32> - } - func.func private @Unknown64(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<64xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<64xf32>, tensor<64xf32>) outs(%0 : tensor<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<64xf32> - return %1 : tensor<64xf32> - } - func.func private @Unknown65(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<64xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<64xf32>, tensor<64xf32>) outs(%0 : tensor<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<64xf32> - return %1 : tensor<64xf32> - } - func.func private @Unknown66(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<64xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<64xf32>, tensor<64xf32>) outs(%0 : tensor<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<64xf32> - return %1 : tensor<64xf32> - } - func.func private @Unknown67(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<64xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<64xf32>, tensor<64xf32>) outs(%0 : tensor<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<64xf32> - return %1 : tensor<64xf32> - } - func.func private @Unknown68(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<64xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<64xf32>, 
tensor<64xf32>) outs(%0 : tensor<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<64xf32> - return %1 : tensor<64xf32> - } - func.func private @Unknown69(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<64xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<64xf32>, tensor<64xf32>) outs(%0 : tensor<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<64xf32> + %1 = scf.for %arg2 = %c0 to %c64 step %c1 iter_args(%arg3 = %0) -> (tensor<64xf32>) { + %extracted_slice = tensor.extract_slice %arg1[%arg2] [1] [1] : tensor<64xf32> to tensor + %extracted_slice_1 = tensor.extract_slice %arg0[%arg2] [1] [1] : tensor<64xf32> to tensor + %2 = tensor.empty() : tensor + %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_1 : tensor, tensor) outs(%2 : tensor) { + ^bb0(%in: f32, %in_2: f32, %out: f32): + %4 = arith.mulf %in, %cst : f32 + %5 = arith.mulf %in_2, %cst_0 : f32 + %6 = arith.addf %5, %4 : f32 + linalg.yield %6 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %3 into %arg3[%arg2] [1] [1] : tensor into tensor<64xf32> + scf.yield %inserted_slice : tensor<64xf32> + } return %1 : tensor<64xf32> } - func.func private @Unknown70(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>) -> tensor<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<64xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<64xf32>, tensor<64xf32>) outs(%0 : tensor<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<64xf32> - return %1 : tensor<64xf32> - } - func.func private @Unknown71(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<128xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<128xf32>, tensor<128xf32>) outs(%0 : tensor<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<128xf32> - return %1 : tensor<128xf32> - } func.func private @Unknown72(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.899999976 : f32 %cst_0 = arith.constant 1.000000e-01 : f32 %0 = tensor.empty() : tensor<128xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<128xf32>, tensor<128xf32>) 
outs(%0 : tensor<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<128xf32> - return %1 : tensor<128xf32> - } - func.func private @Unknown73(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<128xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<128xf32>, tensor<128xf32>) outs(%0 : tensor<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<128xf32> + %1 = scf.for %arg2 = %c0 to %c128 step %c1 iter_args(%arg3 = %0) -> (tensor<128xf32>) { + %extracted_slice = tensor.extract_slice %arg1[%arg2] [1] [1] : tensor<128xf32> to tensor + %extracted_slice_1 = tensor.extract_slice %arg0[%arg2] [1] [1] : tensor<128xf32> to tensor + %2 = tensor.empty() : tensor + %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_1 : tensor, tensor) outs(%2 : tensor) { + ^bb0(%in: f32, %in_2: f32, %out: f32): + %4 = arith.mulf %in, %cst : f32 + %5 = arith.mulf %in_2, %cst_0 : f32 + %6 = arith.addf %5, %4 : f32 + linalg.yield %6 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %3 into %arg3[%arg2] [1] [1] : tensor into tensor<128xf32> + scf.yield %inserted_slice : tensor<128xf32> + } return %1 : tensor<128xf32> } - func.func private @Unknown74(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<128xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<128xf32>, tensor<128xf32>) outs(%0 : tensor<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<128xf32> - return %1 : tensor<128xf32> - } - func.func private @Unknown75(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<128xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<128xf32>, tensor<128xf32>) outs(%0 : tensor<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<128xf32> - return %1 : tensor<128xf32> - } - func.func private @Unknown76(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<128xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<128xf32>, tensor<128xf32>) outs(%0 : tensor<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = 
arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<128xf32> - return %1 : tensor<128xf32> - } - func.func private @Unknown77(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<128xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<128xf32>, tensor<128xf32>) outs(%0 : tensor<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<128xf32> - return %1 : tensor<128xf32> - } - func.func private @Unknown78(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<128xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<128xf32>, tensor<128xf32>) outs(%0 : tensor<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<128xf32> - return %1 : tensor<128xf32> - } - func.func private @Unknown79(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<128xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<128xf32>, tensor<128xf32>) outs(%0 : tensor<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<128xf32> - return %1 : tensor<128xf32> - } - func.func private @Unknown80(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<128xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<128xf32>, tensor<128xf32>) outs(%0 : tensor<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<128xf32> - return %1 : tensor<128xf32> - } - func.func private @Unknown81(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<256xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<256xf32>, tensor<256xf32>) outs(%0 : tensor<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<256xf32> - return %1 : tensor<256xf32> - } func.func private 
@Unknown82(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.899999976 : f32 %cst_0 = arith.constant 1.000000e-01 : f32 %0 = tensor.empty() : tensor<256xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<256xf32>, tensor<256xf32>) outs(%0 : tensor<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<256xf32> - return %1 : tensor<256xf32> - } - func.func private @Unknown83(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<256xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<256xf32>, tensor<256xf32>) outs(%0 : tensor<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<256xf32> - return %1 : tensor<256xf32> - } - func.func private @Unknown84(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<256xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<256xf32>, tensor<256xf32>) outs(%0 : tensor<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<256xf32> - return %1 : tensor<256xf32> - } - func.func private @Unknown85(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<256xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<256xf32>, tensor<256xf32>) outs(%0 : tensor<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<256xf32> - return %1 : tensor<256xf32> - } - func.func private @Unknown86(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<256xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<256xf32>, tensor<256xf32>) outs(%0 : tensor<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<256xf32> + %1 = scf.for %arg2 = %c0 to %c256 step %c1 iter_args(%arg3 = %0) -> (tensor<256xf32>) { + %extracted_slice = tensor.extract_slice %arg1[%arg2] [1] [1] : 
tensor<256xf32> to tensor + %extracted_slice_1 = tensor.extract_slice %arg0[%arg2] [1] [1] : tensor<256xf32> to tensor + %2 = tensor.empty() : tensor + %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_1 : tensor, tensor) outs(%2 : tensor) { + ^bb0(%in: f32, %in_2: f32, %out: f32): + %4 = arith.mulf %in, %cst : f32 + %5 = arith.mulf %in_2, %cst_0 : f32 + %6 = arith.addf %5, %4 : f32 + linalg.yield %6 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %3 into %arg3[%arg2] [1] [1] : tensor into tensor<256xf32> + scf.yield %inserted_slice : tensor<256xf32> + } return %1 : tensor<256xf32> } - func.func private @Unknown87(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<256xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<256xf32>, tensor<256xf32>) outs(%0 : tensor<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<256xf32> - return %1 : tensor<256xf32> - } - func.func private @Unknown88(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<256xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<256xf32>, tensor<256xf32>) outs(%0 : tensor<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<256xf32> - return %1 : tensor<256xf32> - } - func.func private @Unknown89(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<256xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<256xf32>, tensor<256xf32>) outs(%0 : tensor<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<256xf32> - return %1 : tensor<256xf32> - } - func.func private @Unknown90(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<256xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<256xf32>, tensor<256xf32>) outs(%0 : tensor<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<256xf32> - return %1 : tensor<256xf32> - } - func.func private @Unknown91(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = 
arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<512xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<512xf32>, tensor<512xf32>) outs(%0 : tensor<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<512xf32> - return %1 : tensor<512xf32> - } func.func private @Unknown92(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.899999976 : f32 %cst_0 = arith.constant 1.000000e-01 : f32 %0 = tensor.empty() : tensor<512xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<512xf32>, tensor<512xf32>) outs(%0 : tensor<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<512xf32> - return %1 : tensor<512xf32> - } - func.func private @Unknown93(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<512xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<512xf32>, tensor<512xf32>) outs(%0 : tensor<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<512xf32> - return %1 : tensor<512xf32> - } - func.func private @Unknown94(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<512xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<512xf32>, tensor<512xf32>) outs(%0 : tensor<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<512xf32> - return %1 : tensor<512xf32> - } - func.func private @Unknown95(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<512xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<512xf32>, tensor<512xf32>) outs(%0 : tensor<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<512xf32> - return %1 : tensor<512xf32> - } - func.func private @Unknown96(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<512xf32> - %1 = linalg.generic 
{indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<512xf32>, tensor<512xf32>) outs(%0 : tensor<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<512xf32> - return %1 : tensor<512xf32> - } - func.func private @Unknown97(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<512xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<512xf32>, tensor<512xf32>) outs(%0 : tensor<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<512xf32> - return %1 : tensor<512xf32> - } - func.func private @Unknown98(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<512xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<512xf32>, tensor<512xf32>) outs(%0 : tensor<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<512xf32> - return %1 : tensor<512xf32> - } - func.func private @Unknown99(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<512xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<512xf32>, tensor<512xf32>) outs(%0 : tensor<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<512xf32> - return %1 : tensor<512xf32> - } - func.func private @Unknown100(%arg0: tensor<512xf32>, %arg1: tensor<512xf32>) -> tensor<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %0 = tensor.empty() : tensor<512xf32> - %1 = linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : tensor<512xf32>, tensor<512xf32>) outs(%0 : tensor<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %2 = arith.mulf %in_1, %cst : f32 - %3 = arith.mulf %in, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - linalg.yield %4 : f32 - } -> tensor<512xf32> + %1 = scf.for %arg2 = %c0 to %c512 step %c1 iter_args(%arg3 = %0) -> (tensor<512xf32>) { + %extracted_slice = tensor.extract_slice %arg1[%arg2] [1] [1] : tensor<512xf32> to tensor + %extracted_slice_1 = tensor.extract_slice %arg0[%arg2] [1] [1] : tensor<512xf32> to tensor + %2 = tensor.empty() : tensor + %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_1 : tensor, tensor) outs(%2 : tensor) { + ^bb0(%in: f32, %in_2: f32, %out: f32): + %4 = arith.mulf %in, 
%cst : f32 + %5 = arith.mulf %in_2, %cst_0 : f32 + %6 = arith.addf %5, %4 : f32 + linalg.yield %6 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %3 into %arg3[%arg2] [1] [1] : tensor into tensor<512xf32> + scf.yield %inserted_slice : tensor<512xf32> + } return %1 : tensor<512xf32> } func.func @main(%arg0: tensor<64xf32>, %arg1: tensor<64xf32>, %arg2: tensor<64x3x7x7xf32>, %arg3: tensor<1000xf32>, %arg4: tensor<1000x512xf32>, %arg5: tensor<64xf32>, %arg6: tensor<64xf32>, %arg7: tensor<64xf32>, %arg8: tensor<64xf32>, %arg9: tensor<64x64x3x3xf32>, %arg10: tensor<64x64x3x3xf32>, %arg11: tensor<64xf32>, %arg12: tensor<64xf32>, %arg13: tensor<64xf32>, %arg14: tensor<64xf32>, %arg15: tensor<64x64x3x3xf32>, %arg16: tensor<64x64x3x3xf32>, %arg17: tensor<128xf32>, %arg18: tensor<128xf32>, %arg19: tensor<128xf32>, %arg20: tensor<128xf32>, %arg21: tensor<128x64x3x3xf32>, %arg22: tensor<128x128x3x3xf32>, %arg23: tensor<128x64x1x1xf32>, %arg24: tensor<128xf32>, %arg25: tensor<128xf32>, %arg26: tensor<128xf32>, %arg27: tensor<128xf32>, %arg28: tensor<128xf32>, %arg29: tensor<128xf32>, %arg30: tensor<128x128x3x3xf32>, %arg31: tensor<128x128x3x3xf32>, %arg32: tensor<256xf32>, %arg33: tensor<256xf32>, %arg34: tensor<256xf32>, %arg35: tensor<256xf32>, %arg36: tensor<256x128x3x3xf32>, %arg37: tensor<256x256x3x3xf32>, %arg38: tensor<256x128x1x1xf32>, %arg39: tensor<256xf32>, %arg40: tensor<256xf32>, %arg41: tensor<256xf32>, %arg42: tensor<256xf32>, %arg43: tensor<256xf32>, %arg44: tensor<256xf32>, %arg45: tensor<256x256x3x3xf32>, %arg46: tensor<256x256x3x3xf32>, %arg47: tensor<512xf32>, %arg48: tensor<512xf32>, %arg49: tensor<512xf32>, %arg50: tensor<512xf32>, %arg51: tensor<512x256x3x3xf32>, %arg52: tensor<512x512x3x3xf32>, %arg53: tensor<512x256x1x1xf32>, %arg54: tensor<512xf32>, %arg55: tensor<512xf32>, %arg56: tensor<512xf32>, %arg57: tensor<512xf32>, %arg58: tensor<512xf32>, %arg59: tensor<512xf32>, %arg60: tensor<512x512x3x3xf32>, %arg61: tensor<512x512x3x3xf32>, %arg62: tensor, %arg63: tensor<64xf32>, %arg64: tensor<64xf32>, %arg65: tensor, %arg66: tensor<64xf32>, %arg67: tensor<64xf32>, %arg68: tensor, %arg69: tensor<64xf32>, %arg70: tensor<64xf32>, %arg71: tensor, %arg72: tensor<64xf32>, %arg73: tensor<64xf32>, %arg74: tensor, %arg75: tensor<64xf32>, %arg76: tensor<64xf32>, %arg77: tensor, %arg78: tensor<128xf32>, %arg79: tensor<128xf32>, %arg80: tensor, %arg81: tensor<128xf32>, %arg82: tensor<128xf32>, %arg83: tensor, %arg84: tensor<128xf32>, %arg85: tensor<128xf32>, %arg86: tensor, %arg87: tensor<128xf32>, %arg88: tensor<128xf32>, %arg89: tensor, %arg90: tensor<128xf32>, %arg91: tensor<128xf32>, %arg92: tensor, %arg93: tensor<256xf32>, %arg94: tensor<256xf32>, %arg95: tensor, %arg96: tensor<256xf32>, %arg97: tensor<256xf32>, %arg98: tensor, %arg99: tensor<256xf32>, %arg100: tensor<256xf32>, %arg101: tensor, %arg102: tensor<256xf32>, %arg103: tensor<256xf32>, %arg104: tensor, %arg105: tensor<256xf32>, %arg106: tensor<256xf32>, %arg107: tensor, %arg108: tensor<512xf32>, %arg109: tensor<512xf32>, %arg110: tensor, %arg111: tensor<512xf32>, %arg112: tensor<512xf32>, %arg113: tensor, %arg114: tensor<512xf32>, %arg115: tensor<512xf32>, %arg116: tensor, %arg117: tensor<512xf32>, %arg118: tensor<512xf32>, %arg119: tensor, %arg120: tensor<512xf32>, %arg121: tensor<512xf32>, %arg122: tensor<1x3x224x224xf32>) -> (tensor<1x1000xf16>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, 
tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<64x3x7x7xf16>, tensor<1x3x224x224xf16>, tensor<1x64x112x112xf16>, tensor<1x64x112x112xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<128x64x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<128x64x1x1xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<256x128x3x3xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, tensor<1x256x14x14xf16>, tensor<256x128x1x1xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<512x256x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<512x256x1x1xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<1x512xf16>, tensor<512x1000xf16>) attributes {__placeholder__byre.entry_point} { @@ -943,7 +859,7 @@ module { %16 = tensor.empty() : tensor<64xf32> %17:3 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%13, %arg6, %arg5 : tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) outs(%14, %15, %16 : tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) : tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> %18 = call @Unknown6(%17#0) : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> - %19 = call @Unknown7(%arg10) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> + %19 = call @Unknown4(%arg10) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> %20 = tensor.empty() : tensor<1x64x56x56xf16> %21 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = 
"NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%18, %19 : tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) outs(%20 : tensor<1x64x56x56xf16>) : tensor<1x64x56x56xf16> %22 = tensor.empty() : tensor<1x64x56x56xf16> @@ -951,22 +867,22 @@ module { %24 = tensor.empty() : tensor<64xf32> %25:3 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%21, %arg8, %arg7 : tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) outs(%22, %23, %24 : tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) : tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> %26 = call @Unknown9(%25#0, %10) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> - %27 = call @Unknown10(%arg15) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> + %27 = call @Unknown4(%arg15) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> %28 = tensor.empty() : tensor<1x64x56x56xf16> %29 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%26, %27 : tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) outs(%28 : tensor<1x64x56x56xf16>) : tensor<1x64x56x56xf16> %30 = tensor.empty() : tensor<1x64x56x56xf16> %31 = tensor.empty() : tensor<64xf32> %32 = tensor.empty() : tensor<64xf32> %33:3 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%29, %arg12, %arg11 : tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) outs(%30, %31, %32 : tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) : tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> - %34 = call @Unknown12(%33#0) : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> - %35 = call @Unknown13(%arg16) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> + %34 = call @Unknown6(%33#0) : (tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> + %35 = call @Unknown4(%arg16) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> %36 = tensor.empty() : tensor<1x64x56x56xf16> %37 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%34, %35 : tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>) outs(%36 : tensor<1x64x56x56xf16>) : tensor<1x64x56x56xf16> %38 = tensor.empty() : tensor<1x64x56x56xf16> %39 = tensor.empty() : tensor<64xf32> %40 = tensor.empty() : tensor<64xf32> %41:3 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%37, %arg14, %arg13 : tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) outs(%38, %39, %40 : tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) : tensor<1x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> - %42 = call @Unknown15(%41#0, %26) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> tensor<1x64x56x56xf16> + %42 = call @Unknown9(%41#0, %26) : (tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>) -> 
tensor<1x64x56x56xf16> %43 = call @Unknown16(%arg23) : (tensor<128x64x1x1xf32>) -> tensor<128x64x1x1xf16> %44 = tensor.empty() : tensor<1x128x28x28xf16> %45 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%42, %43 : tensor<1x64x56x56xf16>, tensor<128x64x1x1xf16>) outs(%44 : tensor<1x128x28x28xf16>) : tensor<1x128x28x28xf16> @@ -990,22 +906,22 @@ module { %63 = tensor.empty() : tensor<128xf32> %64:3 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%60, %arg20, %arg19 : tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) outs(%61, %62, %63 : tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) : tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> %65 = call @Unknown23(%64#0, %49#0) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> - %66 = call @Unknown24(%arg30) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> + %66 = call @Unknown21(%arg30) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> %67 = tensor.empty() : tensor<1x128x28x28xf16> %68 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%65, %66 : tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) outs(%67 : tensor<1x128x28x28xf16>) : tensor<1x128x28x28xf16> %69 = tensor.empty() : tensor<1x128x28x28xf16> %70 = tensor.empty() : tensor<128xf32> %71 = tensor.empty() : tensor<128xf32> %72:3 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%68, %arg27, %arg26 : tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) outs(%69, %70, %71 : tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) : tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - %73 = call @Unknown26(%72#0) : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> - %74 = call @Unknown27(%arg31) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> + %73 = call @Unknown20(%72#0) : (tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> + %74 = call @Unknown21(%arg31) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> %75 = tensor.empty() : tensor<1x128x28x28xf16> %76 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%73, %74 : tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>) outs(%75 : tensor<1x128x28x28xf16>) : tensor<1x128x28x28xf16> %77 = tensor.empty() : tensor<1x128x28x28xf16> %78 = tensor.empty() : tensor<128xf32> %79 = tensor.empty() : tensor<128xf32> %80:3 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%76, %arg29, %arg28 : tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) outs(%77, %78, %79 : 
tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) : tensor<1x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - %81 = call @Unknown29(%80#0, %65) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> + %81 = call @Unknown23(%80#0, %65) : (tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>) -> tensor<1x128x28x28xf16> %82 = call @Unknown30(%arg38) : (tensor<256x128x1x1xf32>) -> tensor<256x128x1x1xf16> %83 = tensor.empty() : tensor<1x256x14x14xf16> %84 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%81, %82 : tensor<1x128x28x28xf16>, tensor<256x128x1x1xf16>) outs(%83 : tensor<1x256x14x14xf16>) : tensor<1x256x14x14xf16> @@ -1029,22 +945,22 @@ module { %102 = tensor.empty() : tensor<256xf32> %103:3 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%99, %arg35, %arg34 : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) outs(%100, %101, %102 : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> %104 = call @Unknown37(%103#0, %88#0) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> - %105 = call @Unknown38(%arg45) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> + %105 = call @Unknown35(%arg45) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> %106 = tensor.empty() : tensor<1x256x14x14xf16> %107 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%104, %105 : tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) outs(%106 : tensor<1x256x14x14xf16>) : tensor<1x256x14x14xf16> %108 = tensor.empty() : tensor<1x256x14x14xf16> %109 = tensor.empty() : tensor<256xf32> %110 = tensor.empty() : tensor<256xf32> %111:3 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%107, %arg42, %arg41 : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) outs(%108, %109, %110 : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - %112 = call @Unknown40(%111#0) : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> - %113 = call @Unknown41(%arg46) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> + %112 = call @Unknown34(%111#0) : (tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> + %113 = call @Unknown35(%arg46) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> %114 = tensor.empty() : tensor<1x256x14x14xf16> %115 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%112, %113 : tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>) outs(%114 : tensor<1x256x14x14xf16>) : 
tensor<1x256x14x14xf16> %116 = tensor.empty() : tensor<1x256x14x14xf16> %117 = tensor.empty() : tensor<256xf32> %118 = tensor.empty() : tensor<256xf32> %119:3 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%115, %arg44, %arg43 : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) outs(%116, %117, %118 : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) : tensor<1x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - %120 = call @Unknown43(%119#0, %104) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> + %120 = call @Unknown37(%119#0, %104) : (tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>) -> tensor<1x256x14x14xf16> %121 = call @Unknown44(%arg53) : (tensor<512x256x1x1xf32>) -> tensor<512x256x1x1xf16> %122 = tensor.empty() : tensor<1x512x7x7xf16> %123 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%120, %121 : tensor<1x256x14x14xf16>, tensor<512x256x1x1xf16>) outs(%122 : tensor<1x512x7x7xf16>) : tensor<1x512x7x7xf16> @@ -1068,71 +984,70 @@ module { %141 = tensor.empty() : tensor<512xf32> %142:3 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%138, %arg50, %arg49 : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) outs(%139, %140, %141 : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> %143 = call @Unknown51(%142#0, %127#0) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> - %144 = call @Unknown52(%arg60) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> + %144 = call @Unknown49(%arg60) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> %145 = tensor.empty() : tensor<1x512x7x7xf16> %146 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%143, %144 : tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) outs(%145 : tensor<1x512x7x7xf16>) : tensor<1x512x7x7xf16> %147 = tensor.empty() : tensor<1x512x7x7xf16> %148 = tensor.empty() : tensor<512xf32> %149 = tensor.empty() : tensor<512xf32> %150:3 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%146, %arg57, %arg56 : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) outs(%147, %148, %149 : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - %151 = call @Unknown54(%150#0) : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> - %152 = call @Unknown55(%arg61) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> + %151 = call @Unknown48(%150#0) : (tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> + %152 = call @Unknown49(%arg61) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> %153 = tensor.empty() : tensor<1x512x7x7xf16> %154 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : 
i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%151, %152 : tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>) outs(%153 : tensor<1x512x7x7xf16>) : tensor<1x512x7x7xf16> %155 = tensor.empty() : tensor<1x512x7x7xf16> %156 = tensor.empty() : tensor<512xf32> %157 = tensor.empty() : tensor<512xf32> %158:3 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%154, %arg59, %arg58 : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) outs(%155, %156, %157 : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) : tensor<1x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - %159 = call @Unknown57(%158#0, %143) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> - %160 = tensor.empty() : tensor<1x512xf16> - %161 = byre.compute_on_tensor @ReduceSumOp_f16_f16 {dimensions = dense<[3, 2]> : tensor<2xi64>} ins(%159 : tensor<1x512x7x7xf16>) outs(%160 : tensor<1x512xf16>) : tensor<1x512xf16> - %162 = call @Unknown58(%161) : (tensor<1x512xf16>) -> tensor<1x512xf16> - %163 = call @Unknown59(%arg4) : (tensor<1000x512xf32>) -> tensor<1000x512xf16> - %164 = tensor.empty() : tensor<512x1000xf16> - %165 = byre.compute_on_tensor @TransposeOp_f16_f16 {minor_to_major = dense<[0, 1]> : tensor<2xindex>, permutation = dense<[1, 0]> : tensor<2xi64>} ins(%163 : tensor<1000x512xf16>) outs(%164 : tensor<512x1000xf16>) : tensor<512x1000xf16> - %166 = tensor.empty() : tensor<1x1000xf16> - %167 = byre.compute_on_tensor @MatmulOp_f16f16_f16 {lhs_contracting_dimension = 1 : i64, rhs_contracting_dimension = 1 : i64} ins(%162, %163 : tensor<1x512xf16>, tensor<1000x512xf16>) outs(%166 : tensor<1x1000xf16>) : tensor<1x1000xf16> - %168 = call @Unknown60(%arg3, %167) : (tensor<1000xf32>, tensor<1x1000xf16>) -> tensor<1x1000xf16> - %169 = call @Unknown61(%7#1, %arg63) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %170 = call @Unknown62(%7#2, %arg64) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %171 = call @Unknown63(%17#1, %arg66) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %172 = call @Unknown64(%17#2, %arg67) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %173 = call @Unknown65(%25#1, %arg69) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %174 = call @Unknown66(%25#2, %arg70) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %175 = call @Unknown67(%33#1, %arg72) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %176 = call @Unknown68(%33#2, %arg73) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %177 = call @Unknown69(%41#1, %arg75) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %178 = call @Unknown70(%41#2, %arg76) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> - %179 = call @Unknown71(%56#1, %arg78) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %180 = call @Unknown72(%56#2, %arg79) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %181 = call @Unknown73(%64#1, %arg81) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %182 = call @Unknown74(%64#2, %arg82) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %183 = call @Unknown75(%49#1, %arg84) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %184 = call @Unknown76(%49#2, %arg85) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %185 = call @Unknown77(%72#1, %arg87) : 
(tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %186 = call @Unknown78(%72#2, %arg88) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %187 = call @Unknown79(%80#1, %arg90) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %188 = call @Unknown80(%80#2, %arg91) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> - %189 = call @Unknown81(%95#1, %arg93) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %190 = call @Unknown82(%95#2, %arg94) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %191 = call @Unknown83(%103#1, %arg96) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %192 = call @Unknown84(%103#2, %arg97) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %193 = call @Unknown85(%88#1, %arg99) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %194 = call @Unknown86(%88#2, %arg100) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %195 = call @Unknown87(%111#1, %arg102) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %196 = call @Unknown88(%111#2, %arg103) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %197 = call @Unknown89(%119#1, %arg105) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %198 = call @Unknown90(%119#2, %arg106) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> - %199 = call @Unknown91(%134#1, %arg108) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %200 = call @Unknown92(%134#2, %arg109) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %201 = call @Unknown93(%142#1, %arg111) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %202 = call @Unknown94(%142#2, %arg112) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %203 = call @Unknown95(%127#1, %arg114) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %204 = call @Unknown96(%127#2, %arg115) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %205 = call @Unknown97(%150#1, %arg117) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %206 = call @Unknown98(%150#2, %arg118) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %207 = call @Unknown99(%158#1, %arg120) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - %208 = call @Unknown100(%158#2, %arg121) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> - return %168, %arg0, %arg1, %arg5, %arg6, %arg7, %arg8, %arg11, %arg12, %arg13, %arg14, %arg17, %arg18, %arg19, %arg20, %arg24, %arg25, %arg26, %arg27, %arg28, %arg29, %arg32, %arg33, %arg34, %arg35, %arg39, %arg40, %arg41, %arg42, %arg43, %arg44, %arg47, %arg48, %arg49, %arg50, %arg54, %arg55, %arg56, %arg57, %arg58, %arg59, %169, %170, %171, %172, %173, %174, %175, %176, %177, %178, %179, %180, %181, %182, %183, %184, %185, %186, %187, %188, %189, %190, %191, %192, %193, %194, %195, %196, %197, %198, %199, %200, %201, %202, %203, %204, %205, %206, %207, %208, %1, %0, %3, %8, %10, %11, %13, %18, %19, %21, %26, %27, %29, %34, %35, %37, %42, %50, %52, %57, %58, %60, %43, %45, %65, %66, %68, %73, %74, %76, %81, %89, %91, %96, %97, %99, %82, %84, %104, %105, %107, %112, %113, %115, %120, %128, %130, %135, %136, %138, %121, %123, %143, %144, %146, %151, %152, %154, %159, %162, %165 : tensor<1x1000xf16>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256xf32>, 
tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<64x3x7x7xf16>, tensor<1x3x224x224xf16>, tensor<1x64x112x112xf16>, tensor<1x64x112x112xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<128x64x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<128x64x1x1xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<256x128x3x3xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, tensor<1x256x14x14xf16>, tensor<256x128x1x1xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<512x256x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<512x256x1x1xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<1x512xf16>, tensor<512x1000xf16> - } -} + %159 = call @Unknown51(%158#0, %143) : (tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>) -> tensor<1x512x7x7xf16> + %160 = call @Unknown58(%159) : (tensor<1x512x7x7xf16>) -> tensor<1x512xf16> + %161 = call @Unknown59(%160) : (tensor<1x512xf16>) -> tensor<1x512xf16> + %162 = call @Unknown60(%arg4) : (tensor<1000x512xf32>) -> tensor<1000x512xf16> + %163 = tensor.empty() : tensor<512x1000xf16> + %164 = byre.compute_on_tensor @TransposeOp_f16_f16 {minor_to_major = dense<[0, 1]> : tensor<2xindex>, permutation = dense<[1, 0]> : tensor<2xi64>} ins(%162 : tensor<1000x512xf16>) outs(%163 : tensor<512x1000xf16>) : tensor<512x1000xf16> + %165 = tensor.empty() : tensor<1x1000xf16> + %166 = byre.compute_on_tensor @MatmulOp_f16f16_f16 {lhs_contracting_dimension = 1 : i64, rhs_contracting_dimension = 1 : i64} ins(%161, %162 : tensor<1x512xf16>, tensor<1000x512xf16>) outs(%165 : tensor<1x1000xf16>) : tensor<1x1000xf16> + %167 = call @Unknown61(%arg3, %166) : (tensor<1000xf32>, tensor<1x1000xf16>) -> tensor<1x1000xf16> + %168 = call @Unknown62(%7#1, %arg63) : (tensor<64xf32>, 
tensor<64xf32>) -> tensor<64xf32> + %169 = call @Unknown62(%7#2, %arg64) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %170 = call @Unknown62(%17#1, %arg66) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %171 = call @Unknown62(%17#2, %arg67) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %172 = call @Unknown62(%25#1, %arg69) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %173 = call @Unknown62(%25#2, %arg70) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %174 = call @Unknown62(%33#1, %arg72) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %175 = call @Unknown62(%33#2, %arg73) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %176 = call @Unknown62(%41#1, %arg75) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %177 = call @Unknown62(%41#2, %arg76) : (tensor<64xf32>, tensor<64xf32>) -> tensor<64xf32> + %178 = call @Unknown72(%56#1, %arg78) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %179 = call @Unknown72(%56#2, %arg79) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %180 = call @Unknown72(%64#1, %arg81) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %181 = call @Unknown72(%64#2, %arg82) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %182 = call @Unknown72(%49#1, %arg84) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %183 = call @Unknown72(%49#2, %arg85) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %184 = call @Unknown72(%72#1, %arg87) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %185 = call @Unknown72(%72#2, %arg88) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %186 = call @Unknown72(%80#1, %arg90) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %187 = call @Unknown72(%80#2, %arg91) : (tensor<128xf32>, tensor<128xf32>) -> tensor<128xf32> + %188 = call @Unknown82(%95#1, %arg93) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %189 = call @Unknown82(%95#2, %arg94) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %190 = call @Unknown82(%103#1, %arg96) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %191 = call @Unknown82(%103#2, %arg97) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %192 = call @Unknown82(%88#1, %arg99) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %193 = call @Unknown82(%88#2, %arg100) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %194 = call @Unknown82(%111#1, %arg102) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %195 = call @Unknown82(%111#2, %arg103) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %196 = call @Unknown82(%119#1, %arg105) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %197 = call @Unknown82(%119#2, %arg106) : (tensor<256xf32>, tensor<256xf32>) -> tensor<256xf32> + %198 = call @Unknown92(%134#1, %arg108) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %199 = call @Unknown92(%134#2, %arg109) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %200 = call @Unknown92(%142#1, %arg111) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %201 = call @Unknown92(%142#2, %arg112) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %202 = call @Unknown92(%127#1, %arg114) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %203 = call @Unknown92(%127#2, %arg115) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %204 = call @Unknown92(%150#1, %arg117) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %205 = call @Unknown92(%150#2, %arg118) : (tensor<512xf32>, 
tensor<512xf32>) -> tensor<512xf32> + %206 = call @Unknown92(%158#1, %arg120) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + %207 = call @Unknown92(%158#2, %arg121) : (tensor<512xf32>, tensor<512xf32>) -> tensor<512xf32> + return %167, %arg0, %arg1, %arg5, %arg6, %arg7, %arg8, %arg11, %arg12, %arg13, %arg14, %arg17, %arg18, %arg19, %arg20, %arg24, %arg25, %arg26, %arg27, %arg28, %arg29, %arg32, %arg33, %arg34, %arg35, %arg39, %arg40, %arg41, %arg42, %arg43, %arg44, %arg47, %arg48, %arg49, %arg50, %arg54, %arg55, %arg56, %arg57, %arg58, %arg59, %168, %169, %170, %171, %172, %173, %174, %175, %176, %177, %178, %179, %180, %181, %182, %183, %184, %185, %186, %187, %188, %189, %190, %191, %192, %193, %194, %195, %196, %197, %198, %199, %200, %201, %202, %203, %204, %205, %206, %207, %1, %0, %3, %8, %10, %11, %13, %18, %19, %21, %26, %27, %29, %34, %35, %37, %42, %50, %52, %57, %58, %60, %43, %45, %65, %66, %68, %73, %74, %76, %81, %89, %91, %96, %97, %99, %82, %84, %104, %105, %107, %112, %113, %115, %120, %128, %130, %135, %136, %138, %121, %123, %143, %144, %146, %151, %152, %154, %159, %161, %164 : tensor<1x1000xf16>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<64x3x7x7xf16>, tensor<1x3x224x224xf16>, tensor<1x64x112x112xf16>, tensor<1x64x112x112xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<64x64x3x3xf16>, tensor<1x64x56x56xf16>, tensor<1x64x56x56xf16>, tensor<128x64x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<128x64x1x1xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<128x128x3x3xf16>, tensor<1x128x28x28xf16>, tensor<1x128x28x28xf16>, tensor<256x128x3x3xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, tensor<1x256x14x14xf16>, tensor<256x128x1x1xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<256x256x3x3xf16>, 
tensor<1x256x14x14xf16>, tensor<1x256x14x14xf16>, tensor<512x256x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<512x256x1x1xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<512x512x3x3xf16>, tensor<1x512x7x7xf16>, tensor<1x512x7x7xf16>, tensor<1x512xf16>, tensor<512x1000xf16> + } +} \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/FW/5_affine_opt.mlir b/compiler/test/E2E/ResNet18/FW/5_affine_opt.mlir index f1e1e5d69..a78a25416 100644 --- a/compiler/test/E2E/ResNet18/FW/5_affine_opt.mlir +++ b/compiler/test/E2E/ResNet18/FW/5_affine_opt.mlir @@ -2,924 +2,708 @@ // CHECK-LABEL: func.func @main -#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> -#map1 = affine_map<(d0, d1) -> (d0, d1)> -#map2 = affine_map<(d0) -> (d0)> +#map = affine_map<() -> ()> +#map1 = affine_map<(d0) -> (d0 mod 64, 49)> +#map2 = affine_map<(d0) -> (d0 mod 64 + 1, 49)> +#map3 = affine_map<(d0, d1) -> (d0 - d1)> +#map4 = affine_map<(d0) -> (d0 * 2)> +#map5 = affine_map<(d0) -> (d0 * 2 + 1)> module { func.func private @Unknown0(%arg0: memref<1x3x224x224xf32>) -> memref<1x3x224x224xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + %c224 = arith.constant 224 : index %alloc = memref.alloc() : memref<1x3x224x224xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<1x3x224x224xf32>) outs(%alloc : memref<1x3x224x224xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c3 step %c1 { + scf.for %arg2 = %c0 to %c224 step %c1 { + scf.for %arg3 = %c0 to %c224 step %c1 { + %subview = memref.subview %arg0[0, %arg1, %arg2, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x3x224x224xf32> to memref> + %subview_0 = memref.subview %alloc[0, %arg1, %arg2, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x3x224x224xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } } return %alloc : memref<1x3x224x224xf16> } func.func private @Unknown1(%arg0: memref<64x3x7x7xf32>) -> memref<64x3x7x7xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + %c7 = arith.constant 7 : index %alloc = memref.alloc() : memref<64x3x7x7xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x3x7x7xf32>) outs(%alloc : memref<64x3x7x7xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c64 step %c1 { + scf.for %arg2 = %c0 to %c3 step %c1 { + scf.for %arg3 = %c0 to %c7 step %c1 { + scf.for %arg4 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x3x7x7xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x3x7x7xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, 
%out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<64x3x7x7xf16> } func.func private @Unknown3(%arg0: memref<1x64x112x112xf16>) -> memref<1x64x112x112xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c112 = arith.constant 112 : index %alloc = memref.alloc() : memref<1x64x112x112xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<1x64x112x112xf16>) outs(%alloc : memref<1x64x112x112xf16>) { - ^bb0(%in: f16, %out: f16): - %0 = arith.maxnumf %in, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c64 step %c1 { + scf.for %arg2 = %c0 to %c112 step %c1 { + scf.for %arg3 = %c0 to %c112 step %c1 { + %subview = memref.subview %arg0[0, %arg1, %arg2, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x112x112xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg1, %arg2, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x112x112xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f16): + %0 = arith.maximumf %in, %cst : f16 + linalg.yield %0 : f16 + } + } + } } return %alloc : memref<1x64x112x112xf16> } func.func private @Unknown4(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<64x64x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf32>) outs(%alloc : memref<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c64 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x64x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x64x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<64x64x3x3xf16> } func.func private @Unknown6(%arg0: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %alloc = memref.alloc() : memref<1x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<1x64x56x56xf16>) outs(%alloc : memref<1x64x56x56xf16>) { - ^bb0(%in: f16, %out: f16): - %0 = arith.maxnumf %in, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c64 step %c1 { + scf.for %arg2 = %c0 to %c56 step %c1 { + scf.for %arg3 = %c0 to %c56 step %c1 { + %subview = memref.subview %arg0[0, %arg1, %arg2, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : 
memref<1x64x56x56xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg1, %arg2, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f16): + %0 = arith.maximumf %in, %cst : f16 + linalg.yield %0 : f16 + } + } + } } return %alloc : memref<1x64x56x56xf16> } - func.func private @Unknown7(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<64x64x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf32>) outs(%alloc : memref<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<64x64x3x3xf16> - } func.func private @Unknown9(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %alloc = memref.alloc() : memref<1x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) outs(%alloc : memref<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.addf %in, %in_0 : f16 - %1 = arith.maxnumf %0, %cst : f16 - linalg.yield %1 : f16 - } - return %alloc : memref<1x64x56x56xf16> - } - func.func private @Unknown10(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<64x64x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf32>) outs(%alloc : memref<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<64x64x3x3xf16> - } - func.func private @Unknown12(%arg0: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<1x64x56x56xf16>) outs(%alloc : memref<1x64x56x56xf16>) { - ^bb0(%in: f16, %out: f16): - %0 = arith.maxnumf %in, %cst : f16 - linalg.yield %0 : f16 - } - return %alloc : memref<1x64x56x56xf16> - } - func.func private @Unknown13(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<64x64x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf32>) outs(%alloc : memref<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<64x64x3x3xf16> - } - func.func private @Unknown15(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : 
memref<1x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) outs(%alloc : memref<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.addf %in, %in_0 : f16 - %1 = arith.maxnumf %0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c56 step %c1 { + scf.for %arg4 = %c0 to %c56 step %c1 { + %subview = memref.subview %arg0[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.addf %in, %in_2 : f16 + %1 = arith.maximumf %0, %cst : f16 + linalg.yield %1 : f16 + } + } + } } return %alloc : memref<1x64x56x56xf16> } func.func private @Unknown16(%arg0: memref<128x64x1x1xf32>) -> memref<128x64x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index %alloc = memref.alloc() : memref<128x64x1x1xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x64x1x1xf32>) outs(%alloc : memref<128x64x1x1xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x1x1xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x1x1xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } } return %alloc : memref<128x64x1x1xf16> } func.func private @Unknown18(%arg0: memref<128x64x3x3xf32>) -> memref<128x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<128x64x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x64x3x3xf32>) outs(%alloc : memref<128x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : 
memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<128x64x3x3xf16> } func.func private @Unknown20(%arg0: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %alloc = memref.alloc() : memref<1x128x28x28xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<1x128x28x28xf16>) outs(%alloc : memref<1x128x28x28xf16>) { - ^bb0(%in: f16, %out: f16): - %0 = arith.maxnumf %in, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c28 step %c1 { + scf.for %arg3 = %c0 to %c28 step %c1 { + %subview = memref.subview %arg0[0, %arg1, %arg2, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x128x28x28xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg1, %arg2, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x128x28x28xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f16): + %0 = arith.maximumf %in, %cst : f16 + linalg.yield %0 : f16 + } + } + } } return %alloc : memref<1x128x28x28xf16> } func.func private @Unknown21(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<128x128x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x128x3x3xf32>) outs(%alloc : memref<128x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c128 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x128x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x128x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<128x128x3x3xf16> } func.func private @Unknown23(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %alloc = memref.alloc() : memref<1x128x28x28xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) outs(%alloc : memref<1x128x28x28xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.addf %in, %in_0 : f16 - %1 = arith.maxnumf %0, %cst : f16 - linalg.yield %1 : f16 - } - return %alloc : memref<1x128x28x28xf16> - } - 
func.func private @Unknown24(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<128x128x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x128x3x3xf32>) outs(%alloc : memref<128x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<128x128x3x3xf16> - } - func.func private @Unknown26(%arg0: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x128x28x28xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<1x128x28x28xf16>) outs(%alloc : memref<1x128x28x28xf16>) { - ^bb0(%in: f16, %out: f16): - %0 = arith.maxnumf %in, %cst : f16 - linalg.yield %0 : f16 - } - return %alloc : memref<1x128x28x28xf16> - } - func.func private @Unknown27(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<128x128x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x128x3x3xf32>) outs(%alloc : memref<128x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<128x128x3x3xf16> - } - func.func private @Unknown29(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x128x28x28xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) outs(%alloc : memref<1x128x28x28xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.addf %in, %in_0 : f16 - %1 = arith.maxnumf %0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg2 = %c0 to %c128 step %c1 { + scf.for %arg3 = %c0 to %c28 step %c1 { + scf.for %arg4 = %c0 to %c28 step %c1 { + %subview = memref.subview %arg0[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x128x28x28xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x128x28x28xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x128x28x28xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.addf %in, %in_2 : f16 + %1 = arith.maximumf %0, %cst : f16 + linalg.yield %1 : f16 + } + } + } } return %alloc : memref<1x128x28x28xf16> } func.func private @Unknown30(%arg0: memref<256x128x1x1xf32>) -> memref<256x128x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index %alloc = memref.alloc() : memref<256x128x1x1xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x128x1x1xf32>) outs(%alloc : 
memref<256x128x1x1xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c128 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x1x1xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x1x1xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } } return %alloc : memref<256x128x1x1xf16> } func.func private @Unknown32(%arg0: memref<256x128x3x3xf32>) -> memref<256x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<256x128x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x128x3x3xf32>) outs(%alloc : memref<256x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c128 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<256x128x3x3xf16> } func.func private @Unknown34(%arg0: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %alloc = memref.alloc() : memref<1x256x14x14xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<1x256x14x14xf16>) outs(%alloc : memref<1x256x14x14xf16>) { - ^bb0(%in: f16, %out: f16): - %0 = arith.maxnumf %in, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c14 step %c1 { + scf.for %arg3 = %c0 to %c14 step %c1 { + %subview = memref.subview %arg0[0, %arg1, %arg2, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x256x14x14xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg1, %arg2, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x256x14x14xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f16): + %0 = arith.maximumf %in, %cst : f16 + linalg.yield %0 : f16 + } + } + } } return %alloc : memref<1x256x14x14xf16> } func.func private @Unknown35(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c3 
= arith.constant 3 : index %alloc = memref.alloc() : memref<256x256x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x256x3x3xf32>) outs(%alloc : memref<256x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x256x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x256x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<256x256x3x3xf16> } func.func private @Unknown37(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %alloc = memref.alloc() : memref<1x256x14x14xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) outs(%alloc : memref<1x256x14x14xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.addf %in, %in_0 : f16 - %1 = arith.maxnumf %0, %cst : f16 - linalg.yield %1 : f16 - } - return %alloc : memref<1x256x14x14xf16> - } - func.func private @Unknown38(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<256x256x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x256x3x3xf32>) outs(%alloc : memref<256x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<256x256x3x3xf16> - } - func.func private @Unknown40(%arg0: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x256x14x14xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<1x256x14x14xf16>) outs(%alloc : memref<1x256x14x14xf16>) { - ^bb0(%in: f16, %out: f16): - %0 = arith.maxnumf %in, %cst : f16 - linalg.yield %0 : f16 - } - return %alloc : memref<1x256x14x14xf16> - } - func.func private @Unknown41(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<256x256x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x256x3x3xf32>) outs(%alloc : memref<256x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<256x256x3x3xf16> - } - func.func private @Unknown43(%arg0: memref<1x256x14x14xf16>, %arg1: 
memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x256x14x14xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) outs(%alloc : memref<1x256x14x14xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.addf %in, %in_0 : f16 - %1 = arith.maxnumf %0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg2 = %c0 to %c256 step %c1 { + scf.for %arg3 = %c0 to %c14 step %c1 { + scf.for %arg4 = %c0 to %c14 step %c1 { + %subview = memref.subview %arg0[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x256x14x14xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x256x14x14xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x256x14x14xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.addf %in, %in_2 : f16 + %1 = arith.maximumf %0, %cst : f16 + linalg.yield %1 : f16 + } + } + } } return %alloc : memref<1x256x14x14xf16> } func.func private @Unknown44(%arg0: memref<512x256x1x1xf32>) -> memref<512x256x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<512x256x1x1xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x256x1x1xf32>) outs(%alloc : memref<512x256x1x1xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x1x1xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x1x1xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } } return %alloc : memref<512x256x1x1xf16> } func.func private @Unknown46(%arg0: memref<512x256x3x3xf32>) -> memref<512x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<512x256x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x256x3x3xf32>) outs(%alloc : memref<512x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x3x3xf32> to memref> + %subview_0 = memref.subview 
%alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<512x256x3x3xf16> } func.func private @Unknown48(%arg0: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %alloc = memref.alloc() : memref<1x512x7x7xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<1x512x7x7xf16>) outs(%alloc : memref<1x512x7x7xf16>) { - ^bb0(%in: f16, %out: f16): - %0 = arith.maxnumf %in, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c7 step %c1 { + scf.for %arg3 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[0, %arg1, %arg2, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg1, %arg2, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f16): + %0 = arith.maximumf %in, %cst : f16 + linalg.yield %0 : f16 + } + } + } } return %alloc : memref<1x512x7x7xf16> } func.func private @Unknown49(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<512x512x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x512x3x3xf32>) outs(%alloc : memref<512x512x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c512 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x512x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x512x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<512x512x3x3xf16> } func.func private @Unknown51(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %alloc = memref.alloc() : memref<1x512x7x7xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) outs(%alloc : memref<1x512x7x7xf16>) { - ^bb0(%in: f16, 
%in_0: f16, %out: f16): - %0 = arith.addf %in, %in_0 : f16 - %1 = arith.maxnumf %0, %cst : f16 - linalg.yield %1 : f16 - } - return %alloc : memref<1x512x7x7xf16> - } - func.func private @Unknown52(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<512x512x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x512x3x3xf32>) outs(%alloc : memref<512x512x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<512x512x3x3xf16> - } - func.func private @Unknown54(%arg0: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x512x7x7xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<1x512x7x7xf16>) outs(%alloc : memref<1x512x7x7xf16>) { - ^bb0(%in: f16, %out: f16): - %0 = arith.maxnumf %in, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg2 = %c0 to %c512 step %c1 { + scf.for %arg3 = %c0 to %c7 step %c1 { + scf.for %arg4 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.addf %in, %in_2 : f16 + %1 = arith.maximumf %0, %cst : f16 + linalg.yield %1 : f16 + } + } + } } return %alloc : memref<1x512x7x7xf16> } - func.func private @Unknown55(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<512x512x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x512x3x3xf32>) outs(%alloc : memref<512x512x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<512x512x3x3xf16> - } - func.func private @Unknown57(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown58(%arg0: memref<1x512x7x7xf16>) -> memref<1x512xf16> attributes {__byteir_reduction_fusion__} { %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x512x7x7xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) outs(%alloc : memref<1x512x7x7xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.addf %in, %in_0 : f16 - %1 = arith.maxnumf %0, %cst : f16 - linalg.yield %1 : f16 - } - return %alloc : memref<1x512x7x7xf16> - } - func.func private @Unknown58(%arg0: memref<1x512xf16>) -> memref<1x512xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %collapse_shape = memref.collapse_shape %arg0 [[0, 1], [2, 3]] : 
memref<1x512x7x7xf16> into memref<512x49xf16> + %alloc = memref.alloc() : memref<512xf16> + scf.forall (%arg1) in (512) { + %subview = memref.subview %collapse_shape[%arg1, 0] [1, 49] [1, 1] : memref<512x49xf16> to memref<49xf16, strided<[1], offset: ?>> + %expand_shape_0 = memref.expand_shape %subview [[0, 1]] : memref<49xf16, strided<[1], offset: ?>> into memref<1x49xf16, strided<[49, 1], offset: ?>> + %alloca = memref.alloca() : memref<64xf16, #gpu.address_space> + scf.forall (%arg2) in (64) { + %0 = affine.min #map1(%arg2) + %1 = affine.min #map2(%arg2) + %2 = affine.apply #map3(%1, %0) + %subview_6 = memref.subview %expand_shape_0[0, %0] [1, %2] [1, 1] : memref<1x49xf16, strided<[49, 1], offset: ?>> to memref> + %expand_shape_7 = memref.expand_shape %subview_6 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %3 = arith.cmpi ugt, %2, %c0 : index + %4 = scf.if %3 -> (f16) { + %6 = memref.load %expand_shape_7[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %6 : f16 + } else { + scf.yield %cst : f16 + } + %5 = arith.addf %4, %cst : f16 + memref.store %5, %alloca[%arg2] : memref<64xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_1 = memref.alloca() : memref<32xf16, #gpu.address_space> + scf.forall (%arg2) in (32) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca[%0] : memref<64xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca[%3] : memref<64xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_1[%arg2] : memref<32xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_2 = memref.alloca() : memref<16xf16, #gpu.address_space> + scf.forall (%arg2) in (16) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_1[%0] : memref<32xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_1[%3] : memref<32xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_2[%arg2] : memref<16xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_3 = memref.alloca() : memref<8xf16, #gpu.address_space> + scf.forall (%arg2) in (8) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_2[%0] : memref<16xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_2[%3] : memref<16xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_3[%arg2] : memref<8xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_4 = memref.alloca() : memref<4xf16, #gpu.address_space> + scf.forall (%arg2) in (4) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_3[%0] : memref<8xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_3[%3] : memref<8xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_4[%arg2] : memref<4xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_5 = memref.alloca() : memref<2xf16, #gpu.address_space> + scf.forall (%arg2) in (2) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_4[%0] : memref<4xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_4[%3] : memref<4xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_5[%arg2] : memref<2xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + scf.forall (%arg2) 
in (1) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_5[%0] : memref<2xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_5[%3] : memref<2xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloc[%arg1] : memref<512xf16> + } {mapping = [#gpu.thread]} + } {mapping = [#gpu.block]} + %expand_shape = memref.expand_shape %alloc [[0, 1]] : memref<512xf16> into memref<1x512xf16> + return %expand_shape : memref<1x512xf16> + } + func.func private @Unknown59(%arg0: memref<1x512xf16>) -> memref<1x512xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 2.040100e-02 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index %alloc = memref.alloc() : memref<1x512xf16> - linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg0 : memref<1x512xf16>) outs(%alloc : memref<1x512xf16>) { - ^bb0(%in: f16, %out: f16): - %0 = arith.mulf %in, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c512 step %c1 { + %subview = memref.subview %arg0[0, %arg1] [1, 1] [1, 1] : memref<1x512xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg1] [1, 1] [1, 1] : memref<1x512xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f16): + %0 = arith.mulf %in, %cst : f16 + linalg.yield %0 : f16 + } } return %alloc : memref<1x512xf16> } - func.func private @Unknown59(%arg0: memref<1000x512xf32>) -> memref<1000x512xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown60(%arg0: memref<1000x512xf32>) -> memref<1000x512xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index %alloc = memref.alloc() : memref<1000x512xf16> - linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg0 : memref<1000x512xf32>) outs(%alloc : memref<1000x512xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c1000 step %c1 { + scf.for %arg2 = %c0 to %c512 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2] [1, 1] [1, 1] : memref<1000x512xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2] [1, 1] [1, 1] : memref<1000x512xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } } return %alloc : memref<1000x512xf16> } - func.func private @Unknown60(%arg0: memref<1000xf32>, %arg1: memref<1x1000xf16>) -> memref<1x1000xf16> attributes {__byteir_elementwise_fusion__} { - %expand_shape = memref.expand_shape %arg0 [[0, 1]] : memref<1000xf32> into memref<1x1000xf32> + func.func private @Unknown61(%arg0: memref<1000xf32>, %arg1: memref<1x1000xf16>) -> memref<1x1000xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index %alloc = memref.alloc() : memref<1x1000xf16> - linalg.generic {indexing_maps = [#map1, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg1, %expand_shape : memref<1x1000xf16>, memref<1x1000xf32>) outs(%alloc : memref<1x1000xf16>) { - 
^bb0(%in: f16, %in_0: f32, %out: f16): - %0 = arith.truncf %in_0 : f32 to f16 - %1 = arith.addf %in, %0 : f16 - linalg.yield %1 : f16 + scf.for %arg2 = %c0 to %c1000 step %c1 { + %subview = memref.subview %arg0[%arg2] [1] [1] : memref<1000xf32> to memref> + %subview_0 = memref.subview %alloc[0, %arg2] [1, 1] [1, 1] : memref<1x1000xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg2] [1, 1] [1, 1] : memref<1x1000xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %in_2: f16, %out: f16): + %0 = arith.truncf %in : f32 to f16 + %1 = arith.addf %in_2, %0 : f16 + linalg.yield %1 : f16 + } } return %alloc : memref<1x1000xf16> } - func.func private @Unknown61(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<64xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<64xf32>, memref<64xf32>) outs(%alloc : memref<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<64xf32> - } func.func private @Unknown62(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 1.000000e-01 : f32 %cst_0 = arith.constant 0.899999976 : f32 + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index %alloc = memref.alloc() : memref<64xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<64xf32>, memref<64xf32>) outs(%alloc : memref<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 + scf.for %arg2 = %c0 to %c64 step %c1 { + %subview = memref.subview %arg1[%arg2] [1] [1] : memref<64xf32> to memref> + %subview_1 = memref.subview %alloc[%arg2] [1] [1] : memref<64xf32> to memref> + %subview_2 = memref.subview %arg0[%arg2] [1] [1] : memref<64xf32> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_2 : memref>, memref>) outs(%subview_1 : memref>) { + ^bb0(%in: f32, %in_3: f32, %out: f32): + %0 = arith.mulf %in, %cst_0 : f32 + %1 = arith.mulf %in_3, %cst : f32 + %2 = arith.addf %1, %0 : f32 + linalg.yield %2 : f32 + } } return %alloc : memref<64xf32> } - func.func private @Unknown63(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<64xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<64xf32>, memref<64xf32>) outs(%alloc : memref<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<64xf32> - } - func.func private @Unknown64(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = 
arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<64xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<64xf32>, memref<64xf32>) outs(%alloc : memref<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<64xf32> - } - func.func private @Unknown65(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<64xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<64xf32>, memref<64xf32>) outs(%alloc : memref<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<64xf32> - } - func.func private @Unknown66(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<64xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<64xf32>, memref<64xf32>) outs(%alloc : memref<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<64xf32> - } - func.func private @Unknown67(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<64xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<64xf32>, memref<64xf32>) outs(%alloc : memref<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<64xf32> - } - func.func private @Unknown68(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<64xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<64xf32>, memref<64xf32>) outs(%alloc : memref<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<64xf32> - } - func.func private @Unknown69(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<64xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<64xf32>, memref<64xf32>) outs(%alloc : memref<64xf32>) { - ^bb0(%in: 
f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<64xf32> - } - func.func private @Unknown70(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<64xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<64xf32>, memref<64xf32>) outs(%alloc : memref<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<64xf32> - } - func.func private @Unknown71(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<128xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<128xf32>, memref<128xf32>) outs(%alloc : memref<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<128xf32> - } func.func private @Unknown72(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 1.000000e-01 : f32 %cst_0 = arith.constant 0.899999976 : f32 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index %alloc = memref.alloc() : memref<128xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<128xf32>, memref<128xf32>) outs(%alloc : memref<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<128xf32> - } - func.func private @Unknown73(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<128xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<128xf32>, memref<128xf32>) outs(%alloc : memref<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 + scf.for %arg2 = %c0 to %c128 step %c1 { + %subview = memref.subview %arg1[%arg2] [1] [1] : memref<128xf32> to memref> + %subview_1 = memref.subview %alloc[%arg2] [1] [1] : memref<128xf32> to memref> + %subview_2 = memref.subview %arg0[%arg2] [1] [1] : memref<128xf32> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_2 : memref>, memref>) outs(%subview_1 : memref>) { + ^bb0(%in: f32, %in_3: f32, %out: f32): + %0 = arith.mulf %in, %cst_0 : f32 + %1 = arith.mulf %in_3, %cst : f32 + %2 = arith.addf %1, %0 : f32 + linalg.yield %2 : f32 + } } return %alloc : memref<128xf32> } - func.func private @Unknown74(%arg0: 
memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<128xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<128xf32>, memref<128xf32>) outs(%alloc : memref<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<128xf32> - } - func.func private @Unknown75(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<128xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<128xf32>, memref<128xf32>) outs(%alloc : memref<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<128xf32> - } - func.func private @Unknown76(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<128xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<128xf32>, memref<128xf32>) outs(%alloc : memref<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<128xf32> - } - func.func private @Unknown77(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<128xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<128xf32>, memref<128xf32>) outs(%alloc : memref<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<128xf32> - } - func.func private @Unknown78(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<128xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<128xf32>, memref<128xf32>) outs(%alloc : memref<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<128xf32> - } - func.func private @Unknown79(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<128xf32> - linalg.generic 
{indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<128xf32>, memref<128xf32>) outs(%alloc : memref<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<128xf32> - } - func.func private @Unknown80(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<128xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<128xf32>, memref<128xf32>) outs(%alloc : memref<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<128xf32> - } - func.func private @Unknown81(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<256xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<256xf32>, memref<256xf32>) outs(%alloc : memref<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<256xf32> - } func.func private @Unknown82(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 1.000000e-01 : f32 %cst_0 = arith.constant 0.899999976 : f32 + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index %alloc = memref.alloc() : memref<256xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<256xf32>, memref<256xf32>) outs(%alloc : memref<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<256xf32> - } - func.func private @Unknown83(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<256xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<256xf32>, memref<256xf32>) outs(%alloc : memref<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<256xf32> - } - func.func private @Unknown84(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<256xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<256xf32>, memref<256xf32>) outs(%alloc : memref<256xf32>) { - ^bb0(%in: 
f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<256xf32> - } - func.func private @Unknown85(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<256xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<256xf32>, memref<256xf32>) outs(%alloc : memref<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<256xf32> - } - func.func private @Unknown86(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<256xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<256xf32>, memref<256xf32>) outs(%alloc : memref<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<256xf32> - } - func.func private @Unknown87(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<256xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<256xf32>, memref<256xf32>) outs(%alloc : memref<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 + scf.for %arg2 = %c0 to %c256 step %c1 { + %subview = memref.subview %arg1[%arg2] [1] [1] : memref<256xf32> to memref> + %subview_1 = memref.subview %alloc[%arg2] [1] [1] : memref<256xf32> to memref> + %subview_2 = memref.subview %arg0[%arg2] [1] [1] : memref<256xf32> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_2 : memref>, memref>) outs(%subview_1 : memref>) { + ^bb0(%in: f32, %in_3: f32, %out: f32): + %0 = arith.mulf %in, %cst_0 : f32 + %1 = arith.mulf %in_3, %cst : f32 + %2 = arith.addf %1, %0 : f32 + linalg.yield %2 : f32 + } } return %alloc : memref<256xf32> } - func.func private @Unknown88(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<256xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<256xf32>, memref<256xf32>) outs(%alloc : memref<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<256xf32> - } - func.func private @Unknown89(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes 
{__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<256xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<256xf32>, memref<256xf32>) outs(%alloc : memref<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<256xf32> - } - func.func private @Unknown90(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<256xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<256xf32>, memref<256xf32>) outs(%alloc : memref<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<256xf32> - } - func.func private @Unknown91(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<512xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<512xf32>, memref<512xf32>) outs(%alloc : memref<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<512xf32> - } func.func private @Unknown92(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 1.000000e-01 : f32 %cst_0 = arith.constant 0.899999976 : f32 + %c0 = arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index %alloc = memref.alloc() : memref<512xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<512xf32>, memref<512xf32>) outs(%alloc : memref<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<512xf32> - } - func.func private @Unknown93(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<512xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<512xf32>, memref<512xf32>) outs(%alloc : memref<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<512xf32> - } - func.func private @Unknown94(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<512xf32> - 
linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<512xf32>, memref<512xf32>) outs(%alloc : memref<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<512xf32> - } - func.func private @Unknown95(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<512xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<512xf32>, memref<512xf32>) outs(%alloc : memref<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<512xf32> - } - func.func private @Unknown96(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<512xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<512xf32>, memref<512xf32>) outs(%alloc : memref<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<512xf32> - } - func.func private @Unknown97(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<512xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<512xf32>, memref<512xf32>) outs(%alloc : memref<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<512xf32> - } - func.func private @Unknown98(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<512xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<512xf32>, memref<512xf32>) outs(%alloc : memref<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<512xf32> - } - func.func private @Unknown99(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<512xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<512xf32>, memref<512xf32>) outs(%alloc : memref<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = 
arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<512xf32> - } - func.func private @Unknown100(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<512xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<512xf32>, memref<512xf32>) outs(%alloc : memref<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 + scf.for %arg2 = %c0 to %c512 step %c1 { + %subview = memref.subview %arg1[%arg2] [1] [1] : memref<512xf32> to memref> + %subview_1 = memref.subview %alloc[%arg2] [1] [1] : memref<512xf32> to memref> + %subview_2 = memref.subview %arg0[%arg2] [1] [1] : memref<512xf32> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_2 : memref>, memref>) outs(%subview_1 : memref>) { + ^bb0(%in: f32, %in_3: f32, %out: f32): + %0 = arith.mulf %in, %cst_0 : f32 + %1 = arith.mulf %in_3, %cst : f32 + %2 = arith.addf %1, %0 : f32 + linalg.yield %2 : f32 + } } return %alloc : memref<512xf32> } @@ -943,7 +727,7 @@ module { %alloc_7 = memref.alloc() : memref<64xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_4, %arg6, %arg5, %alloc_5, %alloc_6, %alloc_7) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32> %4 = call @Unknown6(%alloc_5) : (memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> - %5 = call @Unknown7(%arg10) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> + %5 = call @Unknown4(%arg10) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> %alloc_8 = memref.alloc() : memref<1x64x56x56xf16> byre.compute @ConvOp_f16f16_f16(%4, %5, %alloc_8) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16> %alloc_9 = memref.alloc() : memref<1x64x56x56xf16> @@ -951,22 +735,22 @@ module { %alloc_11 = memref.alloc() : memref<64xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_8, %arg8, %arg7, %alloc_9, %alloc_10, %alloc_11) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32> %6 = call @Unknown9(%alloc_9, %alloc_3) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> - %7 = call @Unknown10(%arg15) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> + %7 = call @Unknown4(%arg15) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> %alloc_12 = memref.alloc() : memref<1x64x56x56xf16> byre.compute @ConvOp_f16f16_f16(%6, %7, %alloc_12) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> 
: tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16> %alloc_13 = memref.alloc() : memref<1x64x56x56xf16> %alloc_14 = memref.alloc() : memref<64xf32> %alloc_15 = memref.alloc() : memref<64xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_12, %arg12, %arg11, %alloc_13, %alloc_14, %alloc_15) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32> - %8 = call @Unknown12(%alloc_13) : (memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> - %9 = call @Unknown13(%arg16) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> + %8 = call @Unknown6(%alloc_13) : (memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> + %9 = call @Unknown4(%arg16) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> %alloc_16 = memref.alloc() : memref<1x64x56x56xf16> byre.compute @ConvOp_f16f16_f16(%8, %9, %alloc_16) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16> %alloc_17 = memref.alloc() : memref<1x64x56x56xf16> %alloc_18 = memref.alloc() : memref<64xf32> %alloc_19 = memref.alloc() : memref<64xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_16, %arg14, %arg13, %alloc_17, %alloc_18, %alloc_19) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32> - %10 = call @Unknown15(%alloc_17, %6) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> + %10 = call @Unknown9(%alloc_17, %6) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> %11 = call @Unknown16(%arg23) : (memref<128x64x1x1xf32>) -> memref<128x64x1x1xf16> %alloc_20 = memref.alloc() : memref<1x128x28x28xf16> byre.compute @ConvOp_f16f16_f16(%10, %11, %alloc_20) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<128x64x1x1xf16>, memref<1x128x28x28xf16> @@ -990,22 +774,22 @@ module { %alloc_31 = memref.alloc() : memref<128xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_28, %arg20, %arg19, %alloc_29, %alloc_30, %alloc_31) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<1x128x28x28xf16>, memref<128xf32>, memref<128xf32> %15 = call @Unknown23(%alloc_29, %alloc_21) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> - %16 = call 
@Unknown24(%arg30) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> + %16 = call @Unknown21(%arg30) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> %alloc_32 = memref.alloc() : memref<1x128x28x28xf16> byre.compute @ConvOp_f16f16_f16(%15, %16, %alloc_32) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16> %alloc_33 = memref.alloc() : memref<1x128x28x28xf16> %alloc_34 = memref.alloc() : memref<128xf32> %alloc_35 = memref.alloc() : memref<128xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_32, %arg27, %arg26, %alloc_33, %alloc_34, %alloc_35) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<1x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %17 = call @Unknown26(%alloc_33) : (memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> - %18 = call @Unknown27(%arg31) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> + %17 = call @Unknown20(%alloc_33) : (memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> + %18 = call @Unknown21(%arg31) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> %alloc_36 = memref.alloc() : memref<1x128x28x28xf16> byre.compute @ConvOp_f16f16_f16(%17, %18, %alloc_36) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16> %alloc_37 = memref.alloc() : memref<1x128x28x28xf16> %alloc_38 = memref.alloc() : memref<128xf32> %alloc_39 = memref.alloc() : memref<128xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_36, %arg29, %arg28, %alloc_37, %alloc_38, %alloc_39) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<1x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %19 = call @Unknown29(%alloc_37, %15) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> + %19 = call @Unknown23(%alloc_37, %15) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> %20 = call @Unknown30(%arg38) : (memref<256x128x1x1xf32>) -> memref<256x128x1x1xf16> %alloc_40 = memref.alloc() : memref<1x256x14x14xf16> byre.compute @ConvOp_f16f16_f16(%19, %20, %alloc_40) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<256x128x1x1xf16>, memref<1x256x14x14xf16> @@ -1029,22 +813,22 @@ module { %alloc_51 = memref.alloc() : memref<256xf32> byre.compute 
@BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_48, %arg35, %arg34, %alloc_49, %alloc_50, %alloc_51) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<1x256x14x14xf16>, memref<256xf32>, memref<256xf32> %24 = call @Unknown37(%alloc_49, %alloc_41) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> - %25 = call @Unknown38(%arg45) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> + %25 = call @Unknown35(%arg45) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> %alloc_52 = memref.alloc() : memref<1x256x14x14xf16> byre.compute @ConvOp_f16f16_f16(%24, %25, %alloc_52) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16> %alloc_53 = memref.alloc() : memref<1x256x14x14xf16> %alloc_54 = memref.alloc() : memref<256xf32> %alloc_55 = memref.alloc() : memref<256xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_52, %arg42, %arg41, %alloc_53, %alloc_54, %alloc_55) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<1x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %26 = call @Unknown40(%alloc_53) : (memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> - %27 = call @Unknown41(%arg46) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> + %26 = call @Unknown34(%alloc_53) : (memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> + %27 = call @Unknown35(%arg46) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> %alloc_56 = memref.alloc() : memref<1x256x14x14xf16> byre.compute @ConvOp_f16f16_f16(%26, %27, %alloc_56) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16> %alloc_57 = memref.alloc() : memref<1x256x14x14xf16> %alloc_58 = memref.alloc() : memref<256xf32> %alloc_59 = memref.alloc() : memref<256xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_56, %arg44, %arg43, %alloc_57, %alloc_58, %alloc_59) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<1x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %28 = call @Unknown43(%alloc_57, %24) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> + %28 = call @Unknown37(%alloc_57, %24) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> %29 = call @Unknown44(%arg53) : (memref<512x256x1x1xf32>) -> memref<512x256x1x1xf16> %alloc_60 = memref.alloc() : memref<1x512x7x7xf16> byre.compute @ConvOp_f16f16_f16(%28, %29, %alloc_60) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, 
input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<512x256x1x1xf16>, memref<1x512x7x7xf16> @@ -1068,71 +852,70 @@ module { %alloc_71 = memref.alloc() : memref<512xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_68, %arg50, %arg49, %alloc_69, %alloc_70, %alloc_71) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<1x512x7x7xf16>, memref<512xf32>, memref<512xf32> %33 = call @Unknown51(%alloc_69, %alloc_61) : (memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> - %34 = call @Unknown52(%arg60) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> + %34 = call @Unknown49(%arg60) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> %alloc_72 = memref.alloc() : memref<1x512x7x7xf16> byre.compute @ConvOp_f16f16_f16(%33, %34, %alloc_72) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16> %alloc_73 = memref.alloc() : memref<1x512x7x7xf16> %alloc_74 = memref.alloc() : memref<512xf32> %alloc_75 = memref.alloc() : memref<512xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_72, %arg57, %arg56, %alloc_73, %alloc_74, %alloc_75) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<1x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %35 = call @Unknown54(%alloc_73) : (memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> - %36 = call @Unknown55(%arg61) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> + %35 = call @Unknown48(%alloc_73) : (memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> + %36 = call @Unknown49(%arg61) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> %alloc_76 = memref.alloc() : memref<1x512x7x7xf16> byre.compute @ConvOp_f16f16_f16(%35, %36, %alloc_76) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16> %alloc_77 = memref.alloc() : memref<1x512x7x7xf16> %alloc_78 = memref.alloc() : memref<512xf32> %alloc_79 = memref.alloc() : memref<512xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_76, %arg59, %arg58, %alloc_77, %alloc_78, %alloc_79) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<1x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %37 = call @Unknown57(%alloc_77, %33) : (memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) -> 
memref<1x512x7x7xf16> - %alloc_80 = memref.alloc() : memref<1x512xf16> - byre.compute @ReduceSumOp_f16_f16(%37, %alloc_80) {dimensions = dense<[3, 2]> : tensor<2xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<1x512x7x7xf16>, memref<1x512xf16> - %38 = call @Unknown58(%alloc_80) : (memref<1x512xf16>) -> memref<1x512xf16> - %39 = call @Unknown59(%arg4) : (memref<1000x512xf32>) -> memref<1000x512xf16> - %alloc_81 = memref.alloc() : memref<512x1000xf16> - byre.compute @TransposeOp_f16_f16(%39, %alloc_81) {memory_effects = [1 : i32, 2 : i32], minor_to_major = dense<[0, 1]> : tensor<2xindex>, permutation = dense<[1, 0]> : tensor<2xi64>} : memref<1000x512xf16>, memref<512x1000xf16> - %alloc_82 = memref.alloc() : memref<1x1000xf16> - byre.compute @MatmulOp_f16f16_f16(%38, %39, %alloc_82) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<1x512xf16>, memref<1000x512xf16>, memref<1x1000xf16> - %40 = call @Unknown60(%arg3, %alloc_82) : (memref<1000xf32>, memref<1x1000xf16>) -> memref<1x1000xf16> - %41 = call @Unknown61(%alloc_1, %arg63) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %42 = call @Unknown62(%alloc_2, %arg64) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %43 = call @Unknown63(%alloc_6, %arg66) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %44 = call @Unknown64(%alloc_7, %arg67) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %45 = call @Unknown65(%alloc_10, %arg69) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %46 = call @Unknown66(%alloc_11, %arg70) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %47 = call @Unknown67(%alloc_14, %arg72) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %48 = call @Unknown68(%alloc_15, %arg73) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %49 = call @Unknown69(%alloc_18, %arg75) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %50 = call @Unknown70(%alloc_19, %arg76) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %51 = call @Unknown71(%alloc_26, %arg78) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %52 = call @Unknown72(%alloc_27, %arg79) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %53 = call @Unknown73(%alloc_30, %arg81) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %54 = call @Unknown74(%alloc_31, %arg82) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %55 = call @Unknown75(%alloc_22, %arg84) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %56 = call @Unknown76(%alloc_23, %arg85) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %57 = call @Unknown77(%alloc_34, %arg87) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %58 = call @Unknown78(%alloc_35, %arg88) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %59 = call @Unknown79(%alloc_38, %arg90) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %60 = call @Unknown80(%alloc_39, %arg91) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %61 = call @Unknown81(%alloc_46, %arg93) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %62 = call @Unknown82(%alloc_47, %arg94) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %63 = call @Unknown83(%alloc_50, %arg96) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %64 = call @Unknown84(%alloc_51, %arg97) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %65 = call @Unknown85(%alloc_42, %arg99) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %66 = call 
@Unknown86(%alloc_43, %arg100) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %67 = call @Unknown87(%alloc_54, %arg102) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %68 = call @Unknown88(%alloc_55, %arg103) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %69 = call @Unknown89(%alloc_58, %arg105) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %70 = call @Unknown90(%alloc_59, %arg106) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %71 = call @Unknown91(%alloc_66, %arg108) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %72 = call @Unknown92(%alloc_67, %arg109) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %73 = call @Unknown93(%alloc_70, %arg111) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %74 = call @Unknown94(%alloc_71, %arg112) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %75 = call @Unknown95(%alloc_62, %arg114) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %76 = call @Unknown96(%alloc_63, %arg115) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %77 = call @Unknown97(%alloc_74, %arg117) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %78 = call @Unknown98(%alloc_75, %arg118) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %79 = call @Unknown99(%alloc_78, %arg120) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %80 = call @Unknown100(%alloc_79, %arg121) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - return %40, %arg0, %arg1, %arg5, %arg6, %arg7, %arg8, %arg11, %arg12, %arg13, %arg14, %arg17, %arg18, %arg19, %arg20, %arg24, %arg25, %arg26, %arg27, %arg28, %arg29, %arg32, %arg33, %arg34, %arg35, %arg39, %arg40, %arg41, %arg42, %arg43, %arg44, %arg47, %arg48, %arg49, %arg50, %arg54, %arg55, %arg56, %arg57, %arg58, %arg59, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %1, %0, %alloc, %2, %alloc_3, %3, %alloc_4, %4, %5, %alloc_8, %6, %7, %alloc_12, %8, %9, %alloc_16, %10, %12, %alloc_24, %13, %14, %alloc_28, %11, %alloc_20, %15, %16, %alloc_32, %17, %18, %alloc_36, %19, %21, %alloc_44, %22, %23, %alloc_48, %20, %alloc_40, %24, %25, %alloc_52, %26, %27, %alloc_56, %28, %30, %alloc_64, %31, %32, %alloc_68, %29, %alloc_60, %33, %34, %alloc_72, %35, %36, %alloc_76, %37, %38, %alloc_81 : memref<1x1000xf16>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, 
memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<64x3x7x7xf16>, memref<1x3x224x224xf16>, memref<1x64x112x112xf16>, memref<1x64x112x112xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<128x64x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<128x64x1x1xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<256x128x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<256x128x1x1xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<512x256x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<512x256x1x1xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<1x512xf16>, memref<512x1000xf16> + %37 = call @Unknown51(%alloc_77, %33) : (memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> + %38 = call @Unknown58(%37) : (memref<1x512x7x7xf16>) -> memref<1x512xf16> + %39 = call @Unknown59(%38) : (memref<1x512xf16>) -> memref<1x512xf16> + %40 = call @Unknown60(%arg4) : (memref<1000x512xf32>) -> memref<1000x512xf16> + %alloc_80 = memref.alloc() : memref<512x1000xf16> + byre.compute @TransposeOp_f16_f16(%40, %alloc_80) {memory_effects = [1 : i32, 2 : i32], minor_to_major = dense<[0, 1]> : tensor<2xindex>, permutation = dense<[1, 0]> : tensor<2xi64>} : memref<1000x512xf16>, memref<512x1000xf16> + %alloc_81 = memref.alloc() : memref<1x1000xf16> + byre.compute @MatmulOp_f16f16_f16(%39, %40, %alloc_81) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<1x512xf16>, memref<1000x512xf16>, memref<1x1000xf16> + %41 = call @Unknown61(%arg3, %alloc_81) : (memref<1000xf32>, memref<1x1000xf16>) -> memref<1x1000xf16> + %42 = call @Unknown62(%alloc_1, %arg63) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %43 = call @Unknown62(%alloc_2, %arg64) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %44 = call @Unknown62(%alloc_6, %arg66) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %45 = call @Unknown62(%alloc_7, %arg67) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %46 = call @Unknown62(%alloc_10, %arg69) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %47 = call @Unknown62(%alloc_11, %arg70) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %48 = call @Unknown62(%alloc_14, %arg72) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %49 = call @Unknown62(%alloc_15, %arg73) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %50 = call @Unknown62(%alloc_18, 
%arg75) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %51 = call @Unknown62(%alloc_19, %arg76) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %52 = call @Unknown72(%alloc_26, %arg78) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %53 = call @Unknown72(%alloc_27, %arg79) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %54 = call @Unknown72(%alloc_30, %arg81) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %55 = call @Unknown72(%alloc_31, %arg82) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %56 = call @Unknown72(%alloc_22, %arg84) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %57 = call @Unknown72(%alloc_23, %arg85) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %58 = call @Unknown72(%alloc_34, %arg87) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %59 = call @Unknown72(%alloc_35, %arg88) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %60 = call @Unknown72(%alloc_38, %arg90) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %61 = call @Unknown72(%alloc_39, %arg91) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %62 = call @Unknown82(%alloc_46, %arg93) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %63 = call @Unknown82(%alloc_47, %arg94) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %64 = call @Unknown82(%alloc_50, %arg96) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %65 = call @Unknown82(%alloc_51, %arg97) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %66 = call @Unknown82(%alloc_42, %arg99) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %67 = call @Unknown82(%alloc_43, %arg100) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %68 = call @Unknown82(%alloc_54, %arg102) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %69 = call @Unknown82(%alloc_55, %arg103) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %70 = call @Unknown82(%alloc_58, %arg105) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %71 = call @Unknown82(%alloc_59, %arg106) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %72 = call @Unknown92(%alloc_66, %arg108) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %73 = call @Unknown92(%alloc_67, %arg109) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %74 = call @Unknown92(%alloc_70, %arg111) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %75 = call @Unknown92(%alloc_71, %arg112) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %76 = call @Unknown92(%alloc_62, %arg114) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %77 = call @Unknown92(%alloc_63, %arg115) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %78 = call @Unknown92(%alloc_74, %arg117) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %79 = call @Unknown92(%alloc_75, %arg118) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %80 = call @Unknown92(%alloc_78, %arg120) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %81 = call @Unknown92(%alloc_79, %arg121) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + return %41, %arg0, %arg1, %arg5, %arg6, %arg7, %arg8, %arg11, %arg12, %arg13, %arg14, %arg17, %arg18, %arg19, %arg20, %arg24, %arg25, %arg26, %arg27, %arg28, %arg29, %arg32, %arg33, %arg34, %arg35, %arg39, %arg40, %arg41, %arg42, %arg43, %arg44, %arg47, %arg48, %arg49, %arg50, %arg54, %arg55, %arg56, %arg57, %arg58, %arg59, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, 
%59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, %1, %0, %alloc, %2, %alloc_3, %3, %alloc_4, %4, %5, %alloc_8, %6, %7, %alloc_12, %8, %9, %alloc_16, %10, %12, %alloc_24, %13, %14, %alloc_28, %11, %alloc_20, %15, %16, %alloc_32, %17, %18, %alloc_36, %19, %21, %alloc_44, %22, %23, %alloc_48, %20, %alloc_40, %24, %25, %alloc_52, %26, %27, %alloc_56, %28, %30, %alloc_64, %31, %32, %alloc_68, %29, %alloc_60, %33, %34, %alloc_72, %35, %36, %alloc_76, %37, %39, %alloc_80 : memref<1x1000xf16>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<64x3x7x7xf16>, memref<1x3x224x224xf16>, memref<1x64x112x112xf16>, memref<1x64x112x112xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<128x64x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<128x64x1x1xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<256x128x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<256x128x1x1xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<512x256x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<512x256x1x1xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<1x512xf16>, memref<512x1000xf16> } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/FW/5_alternative_scf_opt.mlir b/compiler/test/E2E/ResNet18/FW/5_alternative_scf_opt.mlir index 
0a4ad01bd..766b56faa 100644 --- a/compiler/test/E2E/ResNet18/FW/5_alternative_scf_opt.mlir +++ b/compiler/test/E2E/ResNet18/FW/5_alternative_scf_opt.mlir @@ -2,924 +2,708 @@ // CHECK-LABEL: func.func @main -#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> -#map1 = affine_map<(d0, d1) -> (d0, d1)> -#map2 = affine_map<(d0) -> (d0)> +#map = affine_map<() -> ()> +#map1 = affine_map<(d0) -> (d0 mod 64, 49)> +#map2 = affine_map<(d0) -> (d0 mod 64 + 1, 49)> +#map3 = affine_map<(d0, d1) -> (d0 - d1)> +#map4 = affine_map<(d0) -> (d0 * 2)> +#map5 = affine_map<(d0) -> (d0 * 2 + 1)> module { func.func private @Unknown0(%arg0: memref<1x3x224x224xf32>) -> memref<1x3x224x224xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + %c224 = arith.constant 224 : index %alloc = memref.alloc() : memref<1x3x224x224xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<1x3x224x224xf32>) outs(%alloc : memref<1x3x224x224xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c3 step %c1 { + scf.for %arg2 = %c0 to %c224 step %c1 { + scf.for %arg3 = %c0 to %c224 step %c1 { + %subview = memref.subview %arg0[0, %arg1, %arg2, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x3x224x224xf32> to memref> + %subview_0 = memref.subview %alloc[0, %arg1, %arg2, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x3x224x224xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } } return %alloc : memref<1x3x224x224xf16> } func.func private @Unknown1(%arg0: memref<64x3x7x7xf32>) -> memref<64x3x7x7xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + %c7 = arith.constant 7 : index %alloc = memref.alloc() : memref<64x3x7x7xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x3x7x7xf32>) outs(%alloc : memref<64x3x7x7xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c64 step %c1 { + scf.for %arg2 = %c0 to %c3 step %c1 { + scf.for %arg3 = %c0 to %c7 step %c1 { + scf.for %arg4 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x3x7x7xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x3x7x7xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<64x3x7x7xf16> } func.func private @Unknown3(%arg0: memref<1x64x112x112xf16>) -> memref<1x64x112x112xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c112 = arith.constant 112 : index %alloc = memref.alloc() : memref<1x64x112x112xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = 
["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<1x64x112x112xf16>) outs(%alloc : memref<1x64x112x112xf16>) { - ^bb0(%in: f16, %out: f16): - %0 = arith.maxnumf %in, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c64 step %c1 { + scf.for %arg2 = %c0 to %c112 step %c1 { + scf.for %arg3 = %c0 to %c112 step %c1 { + %subview = memref.subview %arg0[0, %arg1, %arg2, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x112x112xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg1, %arg2, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x112x112xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f16): + %0 = arith.maximumf %in, %cst : f16 + linalg.yield %0 : f16 + } + } + } } return %alloc : memref<1x64x112x112xf16> } func.func private @Unknown4(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<64x64x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf32>) outs(%alloc : memref<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c64 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x64x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x64x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<64x64x3x3xf16> } func.func private @Unknown6(%arg0: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %alloc = memref.alloc() : memref<1x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<1x64x56x56xf16>) outs(%alloc : memref<1x64x56x56xf16>) { - ^bb0(%in: f16, %out: f16): - %0 = arith.maxnumf %in, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c64 step %c1 { + scf.for %arg2 = %c0 to %c56 step %c1 { + scf.for %arg3 = %c0 to %c56 step %c1 { + %subview = memref.subview %arg0[0, %arg1, %arg2, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg1, %arg2, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f16): + %0 = arith.maximumf %in, %cst : f16 + linalg.yield %0 : f16 + } + } + } } return %alloc : memref<1x64x56x56xf16> } - func.func private @Unknown7(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = 
memref.alloc() : memref<64x64x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf32>) outs(%alloc : memref<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<64x64x3x3xf16> - } func.func private @Unknown9(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %alloc = memref.alloc() : memref<1x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) outs(%alloc : memref<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.addf %in, %in_0 : f16 - %1 = arith.maxnumf %0, %cst : f16 - linalg.yield %1 : f16 - } - return %alloc : memref<1x64x56x56xf16> - } - func.func private @Unknown10(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<64x64x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf32>) outs(%alloc : memref<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<64x64x3x3xf16> - } - func.func private @Unknown12(%arg0: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<1x64x56x56xf16>) outs(%alloc : memref<1x64x56x56xf16>) { - ^bb0(%in: f16, %out: f16): - %0 = arith.maxnumf %in, %cst : f16 - linalg.yield %0 : f16 - } - return %alloc : memref<1x64x56x56xf16> - } - func.func private @Unknown13(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<64x64x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf32>) outs(%alloc : memref<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<64x64x3x3xf16> - } - func.func private @Unknown15(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) outs(%alloc : memref<1x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.addf %in, %in_0 : f16 - %1 = arith.maxnumf %0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c56 step %c1 { + scf.for %arg4 = %c0 to %c56 step %c1 { + %subview = memref.subview %arg0[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] 
[1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x64x56x56xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.addf %in, %in_2 : f16 + %1 = arith.maximumf %0, %cst : f16 + linalg.yield %1 : f16 + } + } + } } return %alloc : memref<1x64x56x56xf16> } func.func private @Unknown16(%arg0: memref<128x64x1x1xf32>) -> memref<128x64x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index %alloc = memref.alloc() : memref<128x64x1x1xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x64x1x1xf32>) outs(%alloc : memref<128x64x1x1xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x1x1xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x1x1xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } } return %alloc : memref<128x64x1x1xf16> } func.func private @Unknown18(%arg0: memref<128x64x3x3xf32>) -> memref<128x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<128x64x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x64x3x3xf32>) outs(%alloc : memref<128x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<128x64x3x3xf16> } func.func private @Unknown20(%arg0: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %alloc = memref.alloc() : memref<1x128x28x28xf16> - linalg.generic {indexing_maps = [#map, #map], 
iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<1x128x28x28xf16>) outs(%alloc : memref<1x128x28x28xf16>) { - ^bb0(%in: f16, %out: f16): - %0 = arith.maxnumf %in, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c28 step %c1 { + scf.for %arg3 = %c0 to %c28 step %c1 { + %subview = memref.subview %arg0[0, %arg1, %arg2, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x128x28x28xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg1, %arg2, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x128x28x28xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f16): + %0 = arith.maximumf %in, %cst : f16 + linalg.yield %0 : f16 + } + } + } } return %alloc : memref<1x128x28x28xf16> } func.func private @Unknown21(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<128x128x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x128x3x3xf32>) outs(%alloc : memref<128x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c128 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x128x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x128x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<128x128x3x3xf16> } func.func private @Unknown23(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %alloc = memref.alloc() : memref<1x128x28x28xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) outs(%alloc : memref<1x128x28x28xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.addf %in, %in_0 : f16 - %1 = arith.maxnumf %0, %cst : f16 - linalg.yield %1 : f16 - } - return %alloc : memref<1x128x28x28xf16> - } - func.func private @Unknown24(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<128x128x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x128x3x3xf32>) outs(%alloc : memref<128x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<128x128x3x3xf16> - } - func.func private @Unknown26(%arg0: 
memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x128x28x28xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<1x128x28x28xf16>) outs(%alloc : memref<1x128x28x28xf16>) { - ^bb0(%in: f16, %out: f16): - %0 = arith.maxnumf %in, %cst : f16 - linalg.yield %0 : f16 - } - return %alloc : memref<1x128x28x28xf16> - } - func.func private @Unknown27(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<128x128x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x128x3x3xf32>) outs(%alloc : memref<128x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<128x128x3x3xf16> - } - func.func private @Unknown29(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x128x28x28xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) outs(%alloc : memref<1x128x28x28xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.addf %in, %in_0 : f16 - %1 = arith.maxnumf %0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg2 = %c0 to %c128 step %c1 { + scf.for %arg3 = %c0 to %c28 step %c1 { + scf.for %arg4 = %c0 to %c28 step %c1 { + %subview = memref.subview %arg0[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x128x28x28xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x128x28x28xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x128x28x28xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.addf %in, %in_2 : f16 + %1 = arith.maximumf %0, %cst : f16 + linalg.yield %1 : f16 + } + } + } } return %alloc : memref<1x128x28x28xf16> } func.func private @Unknown30(%arg0: memref<256x128x1x1xf32>) -> memref<256x128x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index %alloc = memref.alloc() : memref<256x128x1x1xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x128x1x1xf32>) outs(%alloc : memref<256x128x1x1xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c128 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x1x1xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x1x1xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) 
{ + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } } return %alloc : memref<256x128x1x1xf16> } func.func private @Unknown32(%arg0: memref<256x128x3x3xf32>) -> memref<256x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<256x128x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x128x3x3xf32>) outs(%alloc : memref<256x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c128 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<256x128x3x3xf16> } func.func private @Unknown34(%arg0: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %alloc = memref.alloc() : memref<1x256x14x14xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<1x256x14x14xf16>) outs(%alloc : memref<1x256x14x14xf16>) { - ^bb0(%in: f16, %out: f16): - %0 = arith.maxnumf %in, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c14 step %c1 { + scf.for %arg3 = %c0 to %c14 step %c1 { + %subview = memref.subview %arg0[0, %arg1, %arg2, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x256x14x14xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg1, %arg2, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x256x14x14xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f16): + %0 = arith.maximumf %in, %cst : f16 + linalg.yield %0 : f16 + } + } + } } return %alloc : memref<1x256x14x14xf16> } func.func private @Unknown35(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<256x256x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x256x3x3xf32>) outs(%alloc : memref<256x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview 
%arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x256x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x256x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<256x256x3x3xf16> } func.func private @Unknown37(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %alloc = memref.alloc() : memref<1x256x14x14xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) outs(%alloc : memref<1x256x14x14xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.addf %in, %in_0 : f16 - %1 = arith.maxnumf %0, %cst : f16 - linalg.yield %1 : f16 - } - return %alloc : memref<1x256x14x14xf16> - } - func.func private @Unknown38(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<256x256x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x256x3x3xf32>) outs(%alloc : memref<256x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<256x256x3x3xf16> - } - func.func private @Unknown40(%arg0: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x256x14x14xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<1x256x14x14xf16>) outs(%alloc : memref<1x256x14x14xf16>) { - ^bb0(%in: f16, %out: f16): - %0 = arith.maxnumf %in, %cst : f16 - linalg.yield %0 : f16 - } - return %alloc : memref<1x256x14x14xf16> - } - func.func private @Unknown41(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<256x256x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x256x3x3xf32>) outs(%alloc : memref<256x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<256x256x3x3xf16> - } - func.func private @Unknown43(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x256x14x14xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) outs(%alloc : memref<1x256x14x14xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.addf %in, %in_0 : f16 - %1 = arith.maxnumf %0, %cst : f16 - linalg.yield %1 : f16 + scf.for 
%arg2 = %c0 to %c256 step %c1 { + scf.for %arg3 = %c0 to %c14 step %c1 { + scf.for %arg4 = %c0 to %c14 step %c1 { + %subview = memref.subview %arg0[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x256x14x14xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x256x14x14xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x256x14x14xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.addf %in, %in_2 : f16 + %1 = arith.maximumf %0, %cst : f16 + linalg.yield %1 : f16 + } + } + } } return %alloc : memref<1x256x14x14xf16> } func.func private @Unknown44(%arg0: memref<512x256x1x1xf32>) -> memref<512x256x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<512x256x1x1xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x256x1x1xf32>) outs(%alloc : memref<512x256x1x1xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x1x1xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x1x1xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } } return %alloc : memref<512x256x1x1xf16> } func.func private @Unknown46(%arg0: memref<512x256x3x3xf32>) -> memref<512x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<512x256x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x256x3x3xf32>) outs(%alloc : memref<512x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<512x256x3x3xf16> } func.func private @Unknown48(%arg0: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = 
arith.constant 1 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %alloc = memref.alloc() : memref<1x512x7x7xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<1x512x7x7xf16>) outs(%alloc : memref<1x512x7x7xf16>) { - ^bb0(%in: f16, %out: f16): - %0 = arith.maxnumf %in, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c7 step %c1 { + scf.for %arg3 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[0, %arg1, %arg2, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg1, %arg2, %arg3] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f16): + %0 = arith.maximumf %in, %cst : f16 + linalg.yield %0 : f16 + } + } + } } return %alloc : memref<1x512x7x7xf16> } func.func private @Unknown49(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<512x512x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x512x3x3xf32>) outs(%alloc : memref<512x512x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c512 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x512x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x512x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<512x512x3x3xf16> } func.func private @Unknown51(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %alloc = memref.alloc() : memref<1x512x7x7xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) outs(%alloc : memref<1x512x7x7xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.addf %in, %in_0 : f16 - %1 = arith.maxnumf %0, %cst : f16 - linalg.yield %1 : f16 - } - return %alloc : memref<1x512x7x7xf16> - } - func.func private @Unknown52(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<512x512x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x512x3x3xf32>) outs(%alloc : memref<512x512x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = 
arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<512x512x3x3xf16> - } - func.func private @Unknown54(%arg0: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x512x7x7xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<1x512x7x7xf16>) outs(%alloc : memref<1x512x7x7xf16>) { - ^bb0(%in: f16, %out: f16): - %0 = arith.maxnumf %in, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg2 = %c0 to %c512 step %c1 { + scf.for %arg3 = %c0 to %c7 step %c1 { + scf.for %arg4 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x512x7x7xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.addf %in, %in_2 : f16 + %1 = arith.maximumf %0, %cst : f16 + linalg.yield %1 : f16 + } + } + } } return %alloc : memref<1x512x7x7xf16> } - func.func private @Unknown55(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<512x512x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x512x3x3xf32>) outs(%alloc : memref<512x512x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<512x512x3x3xf16> - } - func.func private @Unknown57(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown58(%arg0: memref<1x512x7x7xf16>) -> memref<1x512xf16> attributes {__byteir_reduction_fusion__} { %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<1x512x7x7xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) outs(%alloc : memref<1x512x7x7xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.addf %in, %in_0 : f16 - %1 = arith.maxnumf %0, %cst : f16 - linalg.yield %1 : f16 - } - return %alloc : memref<1x512x7x7xf16> - } - func.func private @Unknown58(%arg0: memref<1x512xf16>) -> memref<1x512xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %collapse_shape = memref.collapse_shape %arg0 [[0, 1], [2, 3]] : memref<1x512x7x7xf16> into memref<512x49xf16> + %alloc = memref.alloc() : memref<512xf16> + scf.forall (%arg1) in (512) { + %subview = memref.subview %collapse_shape[%arg1, 0] [1, 49] [1, 1] : memref<512x49xf16> to memref<49xf16, strided<[1], offset: ?>> + %expand_shape_0 = memref.expand_shape %subview [[0, 1]] : memref<49xf16, strided<[1], offset: ?>> into memref<1x49xf16, strided<[49, 1], offset: ?>> + %alloca = memref.alloca() : memref<64xf16, #gpu.address_space> + scf.forall (%arg2) in (64) { + %0 = affine.min #map1(%arg2) + %1 = affine.min #map2(%arg2) + %2 = affine.apply #map3(%1, 
%0) + %subview_6 = memref.subview %expand_shape_0[0, %0] [1, %2] [1, 1] : memref<1x49xf16, strided<[49, 1], offset: ?>> to memref> + %expand_shape_7 = memref.expand_shape %subview_6 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %3 = arith.cmpi ugt, %2, %c0 : index + %4 = scf.if %3 -> (f16) { + %6 = memref.load %expand_shape_7[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %6 : f16 + } else { + scf.yield %cst : f16 + } + %5 = arith.addf %4, %cst : f16 + memref.store %5, %alloca[%arg2] : memref<64xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_1 = memref.alloca() : memref<32xf16, #gpu.address_space> + scf.forall (%arg2) in (32) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca[%0] : memref<64xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca[%3] : memref<64xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_1[%arg2] : memref<32xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_2 = memref.alloca() : memref<16xf16, #gpu.address_space> + scf.forall (%arg2) in (16) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_1[%0] : memref<32xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_1[%3] : memref<32xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_2[%arg2] : memref<16xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_3 = memref.alloca() : memref<8xf16, #gpu.address_space> + scf.forall (%arg2) in (8) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_2[%0] : memref<16xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_2[%3] : memref<16xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_3[%arg2] : memref<8xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_4 = memref.alloca() : memref<4xf16, #gpu.address_space> + scf.forall (%arg2) in (4) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_3[%0] : memref<8xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_3[%3] : memref<8xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_4[%arg2] : memref<4xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_5 = memref.alloca() : memref<2xf16, #gpu.address_space> + scf.forall (%arg2) in (2) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_4[%0] : memref<4xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_4[%3] : memref<4xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_5[%arg2] : memref<2xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + scf.forall (%arg2) in (1) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_5[%0] : memref<2xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_5[%3] : memref<2xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloc[%arg1] : memref<512xf16> + } {mapping = [#gpu.thread]} + } {mapping = [#gpu.block]} + %expand_shape = memref.expand_shape %alloc [[0, 1]] : memref<512xf16> into memref<1x512xf16> + return %expand_shape : memref<1x512xf16> + } + func.func private @Unknown59(%arg0: memref<1x512xf16>) -> 
memref<1x512xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 2.040100e-02 : f16 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index %alloc = memref.alloc() : memref<1x512xf16> - linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg0 : memref<1x512xf16>) outs(%alloc : memref<1x512xf16>) { - ^bb0(%in: f16, %out: f16): - %0 = arith.mulf %in, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c512 step %c1 { + %subview = memref.subview %arg0[0, %arg1] [1, 1] [1, 1] : memref<1x512xf16> to memref> + %subview_0 = memref.subview %alloc[0, %arg1] [1, 1] [1, 1] : memref<1x512xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f16): + %0 = arith.mulf %in, %cst : f16 + linalg.yield %0 : f16 + } } return %alloc : memref<1x512xf16> } - func.func private @Unknown59(%arg0: memref<1000x512xf32>) -> memref<1000x512xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown60(%arg0: memref<1000x512xf32>) -> memref<1000x512xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index %alloc = memref.alloc() : memref<1000x512xf16> - linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg0 : memref<1000x512xf32>) outs(%alloc : memref<1000x512xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c1000 step %c1 { + scf.for %arg2 = %c0 to %c512 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2] [1, 1] [1, 1] : memref<1000x512xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2] [1, 1] [1, 1] : memref<1000x512xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } } return %alloc : memref<1000x512xf16> } - func.func private @Unknown60(%arg0: memref<1000xf32>, %arg1: memref<1x1000xf16>) -> memref<1x1000xf16> attributes {__byteir_elementwise_fusion__} { - %expand_shape = memref.expand_shape %arg0 [[0, 1]] : memref<1000xf32> into memref<1x1000xf32> + func.func private @Unknown61(%arg0: memref<1000xf32>, %arg1: memref<1x1000xf16>) -> memref<1x1000xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index %alloc = memref.alloc() : memref<1x1000xf16> - linalg.generic {indexing_maps = [#map1, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg1, %expand_shape : memref<1x1000xf16>, memref<1x1000xf32>) outs(%alloc : memref<1x1000xf16>) { - ^bb0(%in: f16, %in_0: f32, %out: f16): - %0 = arith.truncf %in_0 : f32 to f16 - %1 = arith.addf %in, %0 : f16 - linalg.yield %1 : f16 + scf.for %arg2 = %c0 to %c1000 step %c1 { + %subview = memref.subview %arg0[%arg2] [1] [1] : memref<1000xf32> to memref> + %subview_0 = memref.subview %alloc[0, %arg2] [1, 1] [1, 1] : memref<1x1000xf16> to memref> + %subview_1 = memref.subview %arg1[0, %arg2] [1, 1] [1, 1] : memref<1x1000xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { 
+ ^bb0(%in: f32, %in_2: f16, %out: f16): + %0 = arith.truncf %in : f32 to f16 + %1 = arith.addf %in_2, %0 : f16 + linalg.yield %1 : f16 + } } return %alloc : memref<1x1000xf16> } - func.func private @Unknown61(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<64xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<64xf32>, memref<64xf32>) outs(%alloc : memref<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<64xf32> - } func.func private @Unknown62(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 1.000000e-01 : f32 %cst_0 = arith.constant 0.899999976 : f32 + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index %alloc = memref.alloc() : memref<64xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<64xf32>, memref<64xf32>) outs(%alloc : memref<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 + scf.for %arg2 = %c0 to %c64 step %c1 { + %subview = memref.subview %arg1[%arg2] [1] [1] : memref<64xf32> to memref> + %subview_1 = memref.subview %alloc[%arg2] [1] [1] : memref<64xf32> to memref> + %subview_2 = memref.subview %arg0[%arg2] [1] [1] : memref<64xf32> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_2 : memref>, memref>) outs(%subview_1 : memref>) { + ^bb0(%in: f32, %in_3: f32, %out: f32): + %0 = arith.mulf %in, %cst_0 : f32 + %1 = arith.mulf %in_3, %cst : f32 + %2 = arith.addf %1, %0 : f32 + linalg.yield %2 : f32 + } } return %alloc : memref<64xf32> } - func.func private @Unknown63(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<64xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<64xf32>, memref<64xf32>) outs(%alloc : memref<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<64xf32> - } - func.func private @Unknown64(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<64xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<64xf32>, memref<64xf32>) outs(%alloc : memref<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<64xf32> - } - func.func private @Unknown65(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> 
memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<64xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<64xf32>, memref<64xf32>) outs(%alloc : memref<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<64xf32> - } - func.func private @Unknown66(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<64xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<64xf32>, memref<64xf32>) outs(%alloc : memref<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<64xf32> - } - func.func private @Unknown67(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<64xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<64xf32>, memref<64xf32>) outs(%alloc : memref<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<64xf32> - } - func.func private @Unknown68(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<64xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<64xf32>, memref<64xf32>) outs(%alloc : memref<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<64xf32> - } - func.func private @Unknown69(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<64xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<64xf32>, memref<64xf32>) outs(%alloc : memref<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<64xf32> - } - func.func private @Unknown70(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<64xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : 
memref<64xf32>, memref<64xf32>) outs(%alloc : memref<64xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<64xf32> - } - func.func private @Unknown71(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<128xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<128xf32>, memref<128xf32>) outs(%alloc : memref<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<128xf32> - } func.func private @Unknown72(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 1.000000e-01 : f32 %cst_0 = arith.constant 0.899999976 : f32 + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index %alloc = memref.alloc() : memref<128xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<128xf32>, memref<128xf32>) outs(%alloc : memref<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 + scf.for %arg2 = %c0 to %c128 step %c1 { + %subview = memref.subview %arg1[%arg2] [1] [1] : memref<128xf32> to memref<f32, strided<[], offset: ?>> + %subview_1 = memref.subview %alloc[%arg2] [1] [1] : memref<128xf32> to memref<f32, strided<[], offset: ?>> + %subview_2 = memref.subview %arg0[%arg2] [1] [1] : memref<128xf32> to memref<f32, strided<[], offset: ?>> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_2 : memref<f32, strided<[], offset: ?>>, memref<f32, strided<[], offset: ?>>) outs(%subview_1 : memref<f32, strided<[], offset: ?>>) { + ^bb0(%in: f32, %in_3: f32, %out: f32): + %0 = arith.mulf %in, %cst_0 : f32 + %1 = arith.mulf %in_3, %cst : f32 + %2 = arith.addf %1, %0 : f32 + linalg.yield %2 : f32 + } } return %alloc : memref<128xf32> } - func.func private @Unknown73(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<128xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<128xf32>, memref<128xf32>) outs(%alloc : memref<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<128xf32> - } - func.func private @Unknown74(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<128xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<128xf32>, memref<128xf32>) outs(%alloc : memref<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32
- } - return %alloc : memref<128xf32> - } - func.func private @Unknown75(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<128xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<128xf32>, memref<128xf32>) outs(%alloc : memref<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<128xf32> - } - func.func private @Unknown76(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<128xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<128xf32>, memref<128xf32>) outs(%alloc : memref<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<128xf32> - } - func.func private @Unknown77(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<128xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<128xf32>, memref<128xf32>) outs(%alloc : memref<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<128xf32> - } - func.func private @Unknown78(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<128xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<128xf32>, memref<128xf32>) outs(%alloc : memref<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<128xf32> - } - func.func private @Unknown79(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<128xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<128xf32>, memref<128xf32>) outs(%alloc : memref<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<128xf32> - } - func.func private @Unknown80(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 
0.899999976 : f32 - %alloc = memref.alloc() : memref<128xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<128xf32>, memref<128xf32>) outs(%alloc : memref<128xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<128xf32> - } - func.func private @Unknown81(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<256xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<256xf32>, memref<256xf32>) outs(%alloc : memref<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<256xf32> - } func.func private @Unknown82(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 1.000000e-01 : f32 %cst_0 = arith.constant 0.899999976 : f32 + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index %alloc = memref.alloc() : memref<256xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<256xf32>, memref<256xf32>) outs(%alloc : memref<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<256xf32> - } - func.func private @Unknown83(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<256xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<256xf32>, memref<256xf32>) outs(%alloc : memref<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<256xf32> - } - func.func private @Unknown84(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<256xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<256xf32>, memref<256xf32>) outs(%alloc : memref<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<256xf32> - } - func.func private @Unknown85(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<256xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : 
memref<256xf32>, memref<256xf32>) outs(%alloc : memref<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<256xf32> - } - func.func private @Unknown86(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<256xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<256xf32>, memref<256xf32>) outs(%alloc : memref<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<256xf32> - } - func.func private @Unknown87(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<256xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<256xf32>, memref<256xf32>) outs(%alloc : memref<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 + scf.for %arg2 = %c0 to %c256 step %c1 { + %subview = memref.subview %arg1[%arg2] [1] [1] : memref<256xf32> to memref<f32, strided<[], offset: ?>> + %subview_1 = memref.subview %alloc[%arg2] [1] [1] : memref<256xf32> to memref<f32, strided<[], offset: ?>> + %subview_2 = memref.subview %arg0[%arg2] [1] [1] : memref<256xf32> to memref<f32, strided<[], offset: ?>> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_2 : memref<f32, strided<[], offset: ?>>, memref<f32, strided<[], offset: ?>>) outs(%subview_1 : memref<f32, strided<[], offset: ?>>) { + ^bb0(%in: f32, %in_3: f32, %out: f32): + %0 = arith.mulf %in, %cst_0 : f32 + %1 = arith.mulf %in_3, %cst : f32 + %2 = arith.addf %1, %0 : f32 + linalg.yield %2 : f32 + } } return %alloc : memref<256xf32> } - func.func private @Unknown88(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<256xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<256xf32>, memref<256xf32>) outs(%alloc : memref<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<256xf32> - } - func.func private @Unknown89(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<256xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<256xf32>, memref<256xf32>) outs(%alloc : memref<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<256xf32> - } - func.func private @Unknown90(%arg0:
memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<256xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<256xf32>, memref<256xf32>) outs(%alloc : memref<256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<256xf32> - } - func.func private @Unknown91(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<512xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<512xf32>, memref<512xf32>) outs(%alloc : memref<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<512xf32> - } func.func private @Unknown92(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 1.000000e-01 : f32 %cst_0 = arith.constant 0.899999976 : f32 + %c0 = arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index %alloc = memref.alloc() : memref<512xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<512xf32>, memref<512xf32>) outs(%alloc : memref<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<512xf32> - } - func.func private @Unknown93(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<512xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<512xf32>, memref<512xf32>) outs(%alloc : memref<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<512xf32> - } - func.func private @Unknown94(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<512xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<512xf32>, memref<512xf32>) outs(%alloc : memref<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<512xf32> - } - func.func private @Unknown95(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = 
arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<512xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<512xf32>, memref<512xf32>) outs(%alloc : memref<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<512xf32> - } - func.func private @Unknown96(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<512xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<512xf32>, memref<512xf32>) outs(%alloc : memref<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<512xf32> - } - func.func private @Unknown97(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<512xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<512xf32>, memref<512xf32>) outs(%alloc : memref<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<512xf32> - } - func.func private @Unknown98(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<512xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<512xf32>, memref<512xf32>) outs(%alloc : memref<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<512xf32> - } - func.func private @Unknown99(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<512xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<512xf32>, memref<512xf32>) outs(%alloc : memref<512xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 - } - return %alloc : memref<512xf32> - } - func.func private @Unknown100(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 1.000000e-01 : f32 - %cst_0 = arith.constant 0.899999976 : f32 - %alloc = memref.alloc() : memref<512xf32> - linalg.generic {indexing_maps = [#map2, #map2, #map2], iterator_types = ["parallel"]} ins(%arg0, %arg1 : memref<512xf32>, memref<512xf32>) outs(%alloc : memref<512xf32>) { - ^bb0(%in: 
f32, %in_1: f32, %out: f32): - %0 = arith.mulf %in_1, %cst_0 : f32 - %1 = arith.mulf %in, %cst : f32 - %2 = arith.addf %1, %0 : f32 - linalg.yield %2 : f32 + scf.for %arg2 = %c0 to %c512 step %c1 { + %subview = memref.subview %arg1[%arg2] [1] [1] : memref<512xf32> to memref<f32, strided<[], offset: ?>> + %subview_1 = memref.subview %alloc[%arg2] [1] [1] : memref<512xf32> to memref<f32, strided<[], offset: ?>> + %subview_2 = memref.subview %arg0[%arg2] [1] [1] : memref<512xf32> to memref<f32, strided<[], offset: ?>> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_2 : memref<f32, strided<[], offset: ?>>, memref<f32, strided<[], offset: ?>>) outs(%subview_1 : memref<f32, strided<[], offset: ?>>) { + ^bb0(%in: f32, %in_3: f32, %out: f32): + %0 = arith.mulf %in, %cst_0 : f32 + %1 = arith.mulf %in_3, %cst : f32 + %2 = arith.addf %1, %0 : f32 + linalg.yield %2 : f32 + } } return %alloc : memref<512xf32> } @@ -943,7 +727,7 @@ module { %alloc_7 = memref.alloc() : memref<64xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_4, %arg6, %arg5, %alloc_5, %alloc_6, %alloc_7) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32> %4 = call @Unknown6(%alloc_5) : (memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> - %5 = call @Unknown7(%arg10) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> + %5 = call @Unknown4(%arg10) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> %alloc_8 = memref.alloc() : memref<1x64x56x56xf16> byre.compute @ConvOp_f16f16_f16(%4, %5, %alloc_8) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16> %alloc_9 = memref.alloc() : memref<1x64x56x56xf16> @@ -951,22 +735,22 @@ module { %alloc_11 = memref.alloc() : memref<64xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_8, %arg8, %arg7, %alloc_9, %alloc_10, %alloc_11) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32> %6 = call @Unknown9(%alloc_9, %alloc_3) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> - %7 = call @Unknown10(%arg15) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> + %7 = call @Unknown4(%arg15) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> %alloc_12 = memref.alloc() : memref<1x64x56x56xf16> byre.compute @ConvOp_f16f16_f16(%6, %7, %alloc_12) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16> %alloc_13 = memref.alloc() : memref<1x64x56x56xf16> %alloc_14 = memref.alloc() : memref<64xf32> %alloc_15 = memref.alloc() : memref<64xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_12, %arg12, %arg11, %alloc_13, %alloc_14, %alloc_15) {epsilon = 9.99999974E-6 : f32, feature_index = 1
: i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32> - %8 = call @Unknown12(%alloc_13) : (memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> - %9 = call @Unknown13(%arg16) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> + %8 = call @Unknown6(%alloc_13) : (memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> + %9 = call @Unknown4(%arg16) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> %alloc_16 = memref.alloc() : memref<1x64x56x56xf16> byre.compute @ConvOp_f16f16_f16(%8, %9, %alloc_16) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16> %alloc_17 = memref.alloc() : memref<1x64x56x56xf16> %alloc_18 = memref.alloc() : memref<64xf32> %alloc_19 = memref.alloc() : memref<64xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_16, %arg14, %arg13, %alloc_17, %alloc_18, %alloc_19) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32> - %10 = call @Unknown15(%alloc_17, %6) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> + %10 = call @Unknown9(%alloc_17, %6) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> %11 = call @Unknown16(%arg23) : (memref<128x64x1x1xf32>) -> memref<128x64x1x1xf16> %alloc_20 = memref.alloc() : memref<1x128x28x28xf16> byre.compute @ConvOp_f16f16_f16(%10, %11, %alloc_20) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<128x64x1x1xf16>, memref<1x128x28x28xf16> @@ -990,22 +774,22 @@ module { %alloc_31 = memref.alloc() : memref<128xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_28, %arg20, %arg19, %alloc_29, %alloc_30, %alloc_31) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<1x128x28x28xf16>, memref<128xf32>, memref<128xf32> %15 = call @Unknown23(%alloc_29, %alloc_21) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> - %16 = call @Unknown24(%arg30) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> + %16 = call @Unknown21(%arg30) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> %alloc_32 = memref.alloc() : memref<1x128x28x28xf16> byre.compute @ConvOp_f16f16_f16(%15, %16, %alloc_32) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : 
tensor<2xi64>} : memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16> %alloc_33 = memref.alloc() : memref<1x128x28x28xf16> %alloc_34 = memref.alloc() : memref<128xf32> %alloc_35 = memref.alloc() : memref<128xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_32, %arg27, %arg26, %alloc_33, %alloc_34, %alloc_35) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<1x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %17 = call @Unknown26(%alloc_33) : (memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> - %18 = call @Unknown27(%arg31) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> + %17 = call @Unknown20(%alloc_33) : (memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> + %18 = call @Unknown21(%arg31) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> %alloc_36 = memref.alloc() : memref<1x128x28x28xf16> byre.compute @ConvOp_f16f16_f16(%17, %18, %alloc_36) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16> %alloc_37 = memref.alloc() : memref<1x128x28x28xf16> %alloc_38 = memref.alloc() : memref<128xf32> %alloc_39 = memref.alloc() : memref<128xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_36, %arg29, %arg28, %alloc_37, %alloc_38, %alloc_39) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<1x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %19 = call @Unknown29(%alloc_37, %15) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> + %19 = call @Unknown23(%alloc_37, %15) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> %20 = call @Unknown30(%arg38) : (memref<256x128x1x1xf32>) -> memref<256x128x1x1xf16> %alloc_40 = memref.alloc() : memref<1x256x14x14xf16> byre.compute @ConvOp_f16f16_f16(%19, %20, %alloc_40) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<256x128x1x1xf16>, memref<1x256x14x14xf16> @@ -1029,22 +813,22 @@ module { %alloc_51 = memref.alloc() : memref<256xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_48, %arg35, %arg34, %alloc_49, %alloc_50, %alloc_51) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<1x256x14x14xf16>, memref<256xf32>, memref<256xf32> %24 = call @Unknown37(%alloc_49, %alloc_41) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> - %25 = call @Unknown38(%arg45) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> + %25 = call @Unknown35(%arg45) : (memref<256x256x3x3xf32>) -> 
memref<256x256x3x3xf16> %alloc_52 = memref.alloc() : memref<1x256x14x14xf16> byre.compute @ConvOp_f16f16_f16(%24, %25, %alloc_52) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16> %alloc_53 = memref.alloc() : memref<1x256x14x14xf16> %alloc_54 = memref.alloc() : memref<256xf32> %alloc_55 = memref.alloc() : memref<256xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_52, %arg42, %arg41, %alloc_53, %alloc_54, %alloc_55) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<1x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %26 = call @Unknown40(%alloc_53) : (memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> - %27 = call @Unknown41(%arg46) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> + %26 = call @Unknown34(%alloc_53) : (memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> + %27 = call @Unknown35(%arg46) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> %alloc_56 = memref.alloc() : memref<1x256x14x14xf16> byre.compute @ConvOp_f16f16_f16(%26, %27, %alloc_56) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16> %alloc_57 = memref.alloc() : memref<1x256x14x14xf16> %alloc_58 = memref.alloc() : memref<256xf32> %alloc_59 = memref.alloc() : memref<256xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_56, %arg44, %arg43, %alloc_57, %alloc_58, %alloc_59) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<1x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %28 = call @Unknown43(%alloc_57, %24) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> + %28 = call @Unknown37(%alloc_57, %24) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> %29 = call @Unknown44(%arg53) : (memref<512x256x1x1xf32>) -> memref<512x256x1x1xf16> %alloc_60 = memref.alloc() : memref<1x512x7x7xf16> byre.compute @ConvOp_f16f16_f16(%28, %29, %alloc_60) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<512x256x1x1xf16>, memref<1x512x7x7xf16> @@ -1068,71 +852,70 @@ module { %alloc_71 = memref.alloc() : memref<512xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_68, %arg50, %arg49, %alloc_69, %alloc_70, %alloc_71) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 
1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<1x512x7x7xf16>, memref<512xf32>, memref<512xf32> %33 = call @Unknown51(%alloc_69, %alloc_61) : (memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> - %34 = call @Unknown52(%arg60) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> + %34 = call @Unknown49(%arg60) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> %alloc_72 = memref.alloc() : memref<1x512x7x7xf16> byre.compute @ConvOp_f16f16_f16(%33, %34, %alloc_72) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16> %alloc_73 = memref.alloc() : memref<1x512x7x7xf16> %alloc_74 = memref.alloc() : memref<512xf32> %alloc_75 = memref.alloc() : memref<512xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_72, %arg57, %arg56, %alloc_73, %alloc_74, %alloc_75) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<1x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %35 = call @Unknown54(%alloc_73) : (memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> - %36 = call @Unknown55(%arg61) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> + %35 = call @Unknown48(%alloc_73) : (memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> + %36 = call @Unknown49(%arg61) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> %alloc_76 = memref.alloc() : memref<1x512x7x7xf16> byre.compute @ConvOp_f16f16_f16(%35, %36, %alloc_76) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16> %alloc_77 = memref.alloc() : memref<1x512x7x7xf16> %alloc_78 = memref.alloc() : memref<512xf32> %alloc_79 = memref.alloc() : memref<512xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_76, %arg59, %arg58, %alloc_77, %alloc_78, %alloc_79) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<1x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %37 = call @Unknown57(%alloc_77, %33) : (memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> - %alloc_80 = memref.alloc() : memref<1x512xf16> - byre.compute @ReduceSumOp_f16_f16(%37, %alloc_80) {dimensions = dense<[3, 2]> : tensor<2xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<1x512x7x7xf16>, memref<1x512xf16> - %38 = call @Unknown58(%alloc_80) : (memref<1x512xf16>) -> memref<1x512xf16> - %39 = call @Unknown59(%arg4) : (memref<1000x512xf32>) -> memref<1000x512xf16> - %alloc_81 = memref.alloc() : memref<512x1000xf16> - byre.compute @TransposeOp_f16_f16(%39, %alloc_81) {memory_effects = [1 : i32, 2 : i32], minor_to_major = dense<[0, 1]> : tensor<2xindex>, permutation = dense<[1, 0]> : 
tensor<2xi64>} : memref<1000x512xf16>, memref<512x1000xf16> - %alloc_82 = memref.alloc() : memref<1x1000xf16> - byre.compute @MatmulOp_f16f16_f16(%38, %39, %alloc_82) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<1x512xf16>, memref<1000x512xf16>, memref<1x1000xf16> - %40 = call @Unknown60(%arg3, %alloc_82) : (memref<1000xf32>, memref<1x1000xf16>) -> memref<1x1000xf16> - %41 = call @Unknown61(%alloc_1, %arg63) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %42 = call @Unknown62(%alloc_2, %arg64) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %43 = call @Unknown63(%alloc_6, %arg66) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %44 = call @Unknown64(%alloc_7, %arg67) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %45 = call @Unknown65(%alloc_10, %arg69) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %46 = call @Unknown66(%alloc_11, %arg70) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %47 = call @Unknown67(%alloc_14, %arg72) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %48 = call @Unknown68(%alloc_15, %arg73) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %49 = call @Unknown69(%alloc_18, %arg75) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %50 = call @Unknown70(%alloc_19, %arg76) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %51 = call @Unknown71(%alloc_26, %arg78) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %52 = call @Unknown72(%alloc_27, %arg79) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %53 = call @Unknown73(%alloc_30, %arg81) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %54 = call @Unknown74(%alloc_31, %arg82) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %55 = call @Unknown75(%alloc_22, %arg84) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %56 = call @Unknown76(%alloc_23, %arg85) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %57 = call @Unknown77(%alloc_34, %arg87) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %58 = call @Unknown78(%alloc_35, %arg88) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %59 = call @Unknown79(%alloc_38, %arg90) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %60 = call @Unknown80(%alloc_39, %arg91) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %61 = call @Unknown81(%alloc_46, %arg93) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %62 = call @Unknown82(%alloc_47, %arg94) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %63 = call @Unknown83(%alloc_50, %arg96) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %64 = call @Unknown84(%alloc_51, %arg97) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %65 = call @Unknown85(%alloc_42, %arg99) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %66 = call @Unknown86(%alloc_43, %arg100) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %67 = call @Unknown87(%alloc_54, %arg102) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %68 = call @Unknown88(%alloc_55, %arg103) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %69 = call @Unknown89(%alloc_58, %arg105) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %70 = call @Unknown90(%alloc_59, %arg106) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %71 = call @Unknown91(%alloc_66, %arg108) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %72 = call @Unknown92(%alloc_67, %arg109) : 
(memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %73 = call @Unknown93(%alloc_70, %arg111) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %74 = call @Unknown94(%alloc_71, %arg112) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %75 = call @Unknown95(%alloc_62, %arg114) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %76 = call @Unknown96(%alloc_63, %arg115) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %77 = call @Unknown97(%alloc_74, %arg117) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %78 = call @Unknown98(%alloc_75, %arg118) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %79 = call @Unknown99(%alloc_78, %arg120) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %80 = call @Unknown100(%alloc_79, %arg121) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - return %40, %arg0, %arg1, %arg5, %arg6, %arg7, %arg8, %arg11, %arg12, %arg13, %arg14, %arg17, %arg18, %arg19, %arg20, %arg24, %arg25, %arg26, %arg27, %arg28, %arg29, %arg32, %arg33, %arg34, %arg35, %arg39, %arg40, %arg41, %arg42, %arg43, %arg44, %arg47, %arg48, %arg49, %arg50, %arg54, %arg55, %arg56, %arg57, %arg58, %arg59, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %1, %0, %alloc, %2, %alloc_3, %3, %alloc_4, %4, %5, %alloc_8, %6, %7, %alloc_12, %8, %9, %alloc_16, %10, %12, %alloc_24, %13, %14, %alloc_28, %11, %alloc_20, %15, %16, %alloc_32, %17, %18, %alloc_36, %19, %21, %alloc_44, %22, %23, %alloc_48, %20, %alloc_40, %24, %25, %alloc_52, %26, %27, %alloc_56, %28, %30, %alloc_64, %31, %32, %alloc_68, %29, %alloc_60, %33, %34, %alloc_72, %35, %36, %alloc_76, %37, %38, %alloc_81 : memref<1x1000xf16>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<64x3x7x7xf16>, memref<1x3x224x224xf16>, memref<1x64x112x112xf16>, memref<1x64x112x112xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, 
memref<1x64x56x56xf16>, memref<128x64x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<128x64x1x1xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<256x128x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<256x128x1x1xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<512x256x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<512x256x1x1xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<1x512xf16>, memref<512x1000xf16> + %37 = call @Unknown51(%alloc_77, %33) : (memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> + %38 = call @Unknown58(%37) : (memref<1x512x7x7xf16>) -> memref<1x512xf16> + %39 = call @Unknown59(%38) : (memref<1x512xf16>) -> memref<1x512xf16> + %40 = call @Unknown60(%arg4) : (memref<1000x512xf32>) -> memref<1000x512xf16> + %alloc_80 = memref.alloc() : memref<512x1000xf16> + byre.compute @TransposeOp_f16_f16(%40, %alloc_80) {memory_effects = [1 : i32, 2 : i32], minor_to_major = dense<[0, 1]> : tensor<2xindex>, permutation = dense<[1, 0]> : tensor<2xi64>} : memref<1000x512xf16>, memref<512x1000xf16> + %alloc_81 = memref.alloc() : memref<1x1000xf16> + byre.compute @MatmulOp_f16f16_f16(%39, %40, %alloc_81) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<1x512xf16>, memref<1000x512xf16>, memref<1x1000xf16> + %41 = call @Unknown61(%arg3, %alloc_81) : (memref<1000xf32>, memref<1x1000xf16>) -> memref<1x1000xf16> + %42 = call @Unknown62(%alloc_1, %arg63) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %43 = call @Unknown62(%alloc_2, %arg64) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %44 = call @Unknown62(%alloc_6, %arg66) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %45 = call @Unknown62(%alloc_7, %arg67) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %46 = call @Unknown62(%alloc_10, %arg69) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %47 = call @Unknown62(%alloc_11, %arg70) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %48 = call @Unknown62(%alloc_14, %arg72) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %49 = call @Unknown62(%alloc_15, %arg73) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %50 = call @Unknown62(%alloc_18, %arg75) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %51 = call @Unknown62(%alloc_19, %arg76) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %52 = call @Unknown72(%alloc_26, %arg78) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %53 = call @Unknown72(%alloc_27, %arg79) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %54 = call @Unknown72(%alloc_30, %arg81) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %55 = call @Unknown72(%alloc_31, %arg82) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %56 = call @Unknown72(%alloc_22, %arg84) : (memref<128xf32>, memref<128xf32>) -> 
memref<128xf32> + %57 = call @Unknown72(%alloc_23, %arg85) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %58 = call @Unknown72(%alloc_34, %arg87) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %59 = call @Unknown72(%alloc_35, %arg88) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %60 = call @Unknown72(%alloc_38, %arg90) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %61 = call @Unknown72(%alloc_39, %arg91) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %62 = call @Unknown82(%alloc_46, %arg93) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %63 = call @Unknown82(%alloc_47, %arg94) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %64 = call @Unknown82(%alloc_50, %arg96) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %65 = call @Unknown82(%alloc_51, %arg97) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %66 = call @Unknown82(%alloc_42, %arg99) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %67 = call @Unknown82(%alloc_43, %arg100) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %68 = call @Unknown82(%alloc_54, %arg102) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %69 = call @Unknown82(%alloc_55, %arg103) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %70 = call @Unknown82(%alloc_58, %arg105) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %71 = call @Unknown82(%alloc_59, %arg106) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %72 = call @Unknown92(%alloc_66, %arg108) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %73 = call @Unknown92(%alloc_67, %arg109) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %74 = call @Unknown92(%alloc_70, %arg111) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %75 = call @Unknown92(%alloc_71, %arg112) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %76 = call @Unknown92(%alloc_62, %arg114) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %77 = call @Unknown92(%alloc_63, %arg115) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %78 = call @Unknown92(%alloc_74, %arg117) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %79 = call @Unknown92(%alloc_75, %arg118) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %80 = call @Unknown92(%alloc_78, %arg120) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %81 = call @Unknown92(%alloc_79, %arg121) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + return %41, %arg0, %arg1, %arg5, %arg6, %arg7, %arg8, %arg11, %arg12, %arg13, %arg14, %arg17, %arg18, %arg19, %arg20, %arg24, %arg25, %arg26, %arg27, %arg28, %arg29, %arg32, %arg33, %arg34, %arg35, %arg39, %arg40, %arg41, %arg42, %arg43, %arg44, %arg47, %arg48, %arg49, %arg50, %arg54, %arg55, %arg56, %arg57, %arg58, %arg59, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, %1, %0, %alloc, %2, %alloc_3, %3, %alloc_4, %4, %5, %alloc_8, %6, %7, %alloc_12, %8, %9, %alloc_16, %10, %12, %alloc_24, %13, %14, %alloc_28, %11, %alloc_20, %15, %16, %alloc_32, %17, %18, %alloc_36, %19, %21, %alloc_44, %22, %23, %alloc_48, %20, %alloc_40, %24, %25, %alloc_52, %26, %27, %alloc_56, %28, %30, %alloc_64, %31, %32, %alloc_68, %29, %alloc_60, %33, %34, %alloc_72, %35, %36, %alloc_76, %37, %39, %alloc_80 : memref<1x1000xf16>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, 
memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<64x3x7x7xf16>, memref<1x3x224x224xf16>, memref<1x64x112x112xf16>, memref<1x64x112x112xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<128x64x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<128x64x1x1xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<256x128x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<256x128x1x1xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<512x256x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<512x256x1x1xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<1x512xf16>, memref<512x1000xf16> } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/FW/6_gpu_opt.mlir b/compiler/test/E2E/ResNet18/FW/6_gpu_opt.mlir index 51d756ad5..41347f655 100644 --- a/compiler/test/E2E/ResNet18/FW/6_gpu_opt.mlir +++ b/compiler/test/E2E/ResNet18/FW/6_gpu_opt.mlir @@ -4,1492 +4,501 @@ module { func.func private @Unknown0(%arg0: memref<1x3x224x224xf32>) -> memref<1x3x224x224xf16> attributes {__byteir_elementwise_fusion__} { + %c224 = arith.constant 224 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %c150528 = arith.constant 150528 : index - %c1 = arith.constant 1 : index - %c224 = arith.constant 224 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<1x3x224x224xf16> scf.for %arg1 = %c0 to %c150528 step %c1 { %0 = arith.remsi %arg1, 
%c224 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c224 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c224 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c224 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c224 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c224 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x3x224x224xf32> - %21 = arith.truncf %20 : f32 to f16 - memref.store %21, %alloc[%c0, %19, %13, %3] : memref<1x3x224x224xf16> + %1 = arith.divsi %arg1, %c224 : index + %2 = arith.remsi %1, %c224 : index + %3 = arith.divsi %1, %c224 : index + %4 = memref.load %arg0[%c0, %3, %2, %0] : memref<1x3x224x224xf32> + %5 = arith.truncf %4 : f32 to f16 + memref.store %5, %alloc[%c0, %3, %2, %0] : memref<1x3x224x224xf16> } return %alloc : memref<1x3x224x224xf16> } func.func private @Unknown1(%arg0: memref<64x3x7x7xf32>) -> memref<64x3x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c9408 = arith.constant 9408 : index - %c1 = arith.constant 1 : index %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c9408 = arith.constant 9408 : index %alloc = memref.alloc() : memref<64x3x7x7xf16> scf.for %arg1 = %c0 to %c9408 step %c1 { %0 = arith.remsi %arg1, %c7 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c7 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c7 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c7 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c7 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c7 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c3 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c3 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c3 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<64x3x7x7xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<64x3x7x7xf16> + %1 = arith.divsi %arg1, %c7 : index + %2 = arith.remsi %1, %c7 : index + %3 = arith.divsi %1, %c7 : index + %4 = arith.remsi %3, %c3 : index + %5 = arith.divsi %3, %c3 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<64x3x7x7xf32> + %7 = arith.truncf %6 : f32 to f16 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<64x3x7x7xf16> } return %alloc : memref<64x3x7x7xf16> } func.func private @Unknown3(%arg0: 
memref<1x64x112x112xf16>) -> memref<1x64x112x112xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 + %c112 = arith.constant 112 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 %c802816 = arith.constant 802816 : index - %c1 = arith.constant 1 : index - %c112 = arith.constant 112 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<1x64x112x112xf16> scf.for %arg1 = %c0 to %c802816 step %c1 { %0 = arith.remsi %arg1, %c112 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c112 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c112 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c112 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c112 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c112 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x64x112x112xf16> - %21 = arith.maxnumf %20, %cst : f16 - memref.store %21, %alloc[%c0, %19, %13, %3] : memref<1x64x112x112xf16> + %1 = arith.divsi %arg1, %c112 : index + %2 = arith.remsi %1, %c112 : index + %3 = arith.divsi %1, %c112 : index + %4 = memref.load %arg0[%c0, %3, %2, %0] : memref<1x64x112x112xf16> + %5 = arith.maximumf %4, %cst : f16 + memref.store %5, %alloc[%c0, %3, %2, %0] : memref<1x64x112x112xf16> } return %alloc : memref<1x64x112x112xf16> } func.func private @Unknown4(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c1 = arith.constant 1 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %alloc = memref.alloc() : memref<64x64x3x3xf16> - scf.for %arg1 = %c0 to %c36864 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<64x64x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, 
%alloc[%29, %23, %13, %3] : memref<64x64x3x3xf16> - } - return %alloc : memref<64x64x3x3xf16> - } - func.func private @Unknown6(%arg0: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index %c1 = arith.constant 1 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %alloc = memref.alloc() : memref<1x64x56x56xf16> - scf.for %arg1 = %c0 to %c200704 step %c1 { - %0 = arith.remsi %arg1, %c56 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c56 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c56 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c56 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c56 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c56 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x64x56x56xf16> - %21 = arith.maxnumf %20, %cst : f16 - memref.store %21, %alloc[%c0, %19, %13, %3] : memref<1x64x56x56xf16> - } - return %alloc : memref<1x64x56x56xf16> - } - func.func private @Unknown7(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c64 = arith.constant 64 : index %c0 = arith.constant 0 : index %c36864 = arith.constant 36864 : index - %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index %alloc = memref.alloc() : memref<64x64x3x3xf16> scf.for %arg1 = %c0 to %c36864 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<64x64x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<64x64x3x3xf16> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c64 : index + %5 = arith.divsi %3, %c64 : index + %6 = 
memref.load %arg0[%5, %4, %2, %0] : memref<64x64x3x3xf32> + %7 = arith.truncf %6 : f32 to f16 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<64x64x3x3xf16> } return %alloc : memref<64x64x3x3xf16> } - func.func private @Unknown9(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c1 = arith.constant 1 : index + func.func private @Unknown6(%arg0: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %alloc = memref.alloc() : memref<1x64x56x56xf16> - scf.for %arg2 = %c0 to %c200704 step %c1 { - %0 = arith.remsi %arg2, %c56 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c56 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c56 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c56 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c56 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c56 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x64x56x56xf16> - %21 = memref.load %arg1[%c0, %19, %13, %3] : memref<1x64x56x56xf16> - %22 = arith.addf %20, %21 : f16 - %23 = arith.maxnumf %22, %cst : f16 - memref.store %23, %alloc[%c0, %19, %13, %3] : memref<1x64x56x56xf16> - } - return %alloc : memref<1x64x56x56xf16> - } - func.func private @Unknown10(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %alloc = memref.alloc() : memref<64x64x3x3xf16> - scf.for %arg1 = %c0 to %c36864 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load 
%arg0[%29, %23, %13, %3] : memref<64x64x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<64x64x3x3xf16> - } - return %alloc : memref<64x64x3x3xf16> - } - func.func private @Unknown12(%arg0: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 %c200704 = arith.constant 200704 : index - %c1 = arith.constant 1 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<1x64x56x56xf16> scf.for %arg1 = %c0 to %c200704 step %c1 { %0 = arith.remsi %arg1, %c56 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c56 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c56 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c56 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c56 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c56 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x64x56x56xf16> - %21 = arith.maxnumf %20, %cst : f16 - memref.store %21, %alloc[%c0, %19, %13, %3] : memref<1x64x56x56xf16> + %1 = arith.divsi %arg1, %c56 : index + %2 = arith.remsi %1, %c56 : index + %3 = arith.divsi %1, %c56 : index + %4 = memref.load %arg0[%c0, %3, %2, %0] : memref<1x64x56x56xf16> + %5 = arith.maximumf %4, %cst : f16 + memref.store %5, %alloc[%c0, %3, %2, %0] : memref<1x64x56x56xf16> } return %alloc : memref<1x64x56x56xf16> } - func.func private @Unknown13(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index + func.func private @Unknown9(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + %c56 = arith.constant 56 : index %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %alloc = memref.alloc() : memref<64x64x3x3xf16> - scf.for %arg1 = %c0 to %c36864 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi 
slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<64x64x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<64x64x3x3xf16> - } - return %alloc : memref<64x64x3x3xf16> - } - func.func private @Unknown15(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 %c200704 = arith.constant 200704 : index - %c1 = arith.constant 1 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<1x64x56x56xf16> scf.for %arg2 = %c0 to %c200704 step %c1 { %0 = arith.remsi %arg2, %c56 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c56 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c56 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c56 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c56 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c56 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x64x56x56xf16> - %21 = memref.load %arg1[%c0, %19, %13, %3] : memref<1x64x56x56xf16> - %22 = arith.addf %20, %21 : f16 - %23 = arith.maxnumf %22, %cst : f16 - memref.store %23, %alloc[%c0, %19, %13, %3] : memref<1x64x56x56xf16> + %1 = arith.divsi %arg2, %c56 : index + %2 = arith.remsi %1, %c56 : index + %3 = arith.divsi %1, %c56 : index + %4 = memref.load %arg0[%c0, %3, %2, %0] : memref<1x64x56x56xf16> + %5 = memref.load %arg1[%c0, %3, %2, %0] : memref<1x64x56x56xf16> + %6 = arith.addf %4, %5 : f16 + %7 = arith.maximumf %6, %cst : f16 + memref.store %7, %alloc[%c0, %3, %2, %0] : memref<1x64x56x56xf16> } return %alloc : memref<1x64x56x56xf16> } func.func private @Unknown16(%arg0: memref<128x64x1x1xf32>) -> memref<128x64x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %c8192 = arith.constant 8192 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<128x64x1x1xf16> scf.for %arg1 = %c0 to %c8192 step %c1 { %0 = arith.remsi %arg1, %c64 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c64 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c64 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = memref.load %arg0[%9, %3, %c0, %c0] : memref<128x64x1x1xf32> - %11 = arith.truncf %10 : f32 to f16 - memref.store %11, %alloc[%9, %3, %c0, %c0] : memref<128x64x1x1xf16> + %1 = arith.divsi %arg1, %c64 : index + %2 = memref.load %arg0[%1, %0, %c0, %c0] : 
memref<128x64x1x1xf32> + %3 = arith.truncf %2 : f32 to f16 + memref.store %3, %alloc[%1, %0, %c0, %c0] : memref<128x64x1x1xf16> } return %alloc : memref<128x64x1x1xf16> } func.func private @Unknown18(%arg0: memref<128x64x3x3xf32>) -> memref<128x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c73728 = arith.constant 73728 : index - %c1 = arith.constant 1 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c73728 = arith.constant 73728 : index %alloc = memref.alloc() : memref<128x64x3x3xf16> scf.for %arg1 = %c0 to %c73728 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<128x64x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<128x64x3x3xf16> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c64 : index + %5 = arith.divsi %3, %c64 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<128x64x3x3xf32> + %7 = arith.truncf %6 : f32 to f16 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<128x64x3x3xf16> } return %alloc : memref<128x64x3x3xf16> } func.func private @Unknown20(%arg0: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 + %c28 = arith.constant 28 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 %c100352 = arith.constant 100352 : index - %c1 = arith.constant 1 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<1x128x28x28xf16> scf.for %arg1 = %c0 to %c100352 step %c1 { %0 = arith.remsi %arg1, %c28 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c28 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c28 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c28 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c28 : index - %13 = arith.select %11, %12, %10 : index - %14 = 
arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c28 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x128x28x28xf16> - %21 = arith.maxnumf %20, %cst : f16 - memref.store %21, %alloc[%c0, %19, %13, %3] : memref<1x128x28x28xf16> + %1 = arith.divsi %arg1, %c28 : index + %2 = arith.remsi %1, %c28 : index + %3 = arith.divsi %1, %c28 : index + %4 = memref.load %arg0[%c0, %3, %2, %0] : memref<1x128x28x28xf16> + %5 = arith.maximumf %4, %cst : f16 + memref.store %5, %alloc[%c0, %3, %2, %0] : memref<1x128x28x28xf16> } return %alloc : memref<1x128x28x28xf16> } func.func private @Unknown21(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c1 = arith.constant 1 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %alloc = memref.alloc() : memref<128x128x3x3xf16> - scf.for %arg1 = %c0 to %c147456 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c128 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c128 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c128 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<128x128x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<128x128x3x3xf16> - } - return %alloc : memref<128x128x3x3xf16> - } - func.func private @Unknown23(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index %c1 = arith.constant 1 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %alloc = memref.alloc() : memref<1x128x28x28xf16> - scf.for %arg2 = %c0 to %c100352 step %c1 { - %0 = arith.remsi %arg2, %c28 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c28 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c28 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c28 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, 
%c28 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c28 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x128x28x28xf16> - %21 = memref.load %arg1[%c0, %19, %13, %3] : memref<1x128x28x28xf16> - %22 = arith.addf %20, %21 : f16 - %23 = arith.maxnumf %22, %cst : f16 - memref.store %23, %alloc[%c0, %19, %13, %3] : memref<1x128x28x28xf16> - } - return %alloc : memref<1x128x28x28xf16> - } - func.func private @Unknown24(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c128 = arith.constant 128 : index %c0 = arith.constant 0 : index %c147456 = arith.constant 147456 : index - %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index %alloc = memref.alloc() : memref<128x128x3x3xf16> scf.for %arg1 = %c0 to %c147456 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c128 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c128 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c128 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<128x128x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<128x128x3x3xf16> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c128 : index + %5 = arith.divsi %3, %c128 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<128x128x3x3xf32> + %7 = arith.truncf %6 : f32 to f16 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<128x128x3x3xf16> } return %alloc : memref<128x128x3x3xf16> } - func.func private @Unknown26(%arg0: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c1 = arith.constant 1 : index + func.func private @Unknown23(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %alloc = memref.alloc() : memref<1x128x28x28xf16> - scf.for %arg1 = %c0 to %c100352 step %c1 { - %0 = arith.remsi %arg1, %c28 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 
= arith.addi %0, %c28 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c28 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c28 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c28 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c28 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x128x28x28xf16> - %21 = arith.maxnumf %20, %cst : f16 - memref.store %21, %alloc[%c0, %19, %13, %3] : memref<1x128x28x28xf16> - } - return %alloc : memref<1x128x28x28xf16> - } - func.func private @Unknown27(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %alloc = memref.alloc() : memref<128x128x3x3xf16> - scf.for %arg1 = %c0 to %c147456 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c128 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c128 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c128 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<128x128x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<128x128x3x3xf16> - } - return %alloc : memref<128x128x3x3xf16> - } - func.func private @Unknown29(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 %c100352 = arith.constant 100352 : index - %c1 = arith.constant 1 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<1x128x28x28xf16> scf.for %arg2 = %c0 to %c100352 step %c1 { %0 = arith.remsi %arg2, %c28 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c28 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, 
%arg2 : index - %7 = arith.divsi %6, %c28 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c28 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c28 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c28 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x128x28x28xf16> - %21 = memref.load %arg1[%c0, %19, %13, %3] : memref<1x128x28x28xf16> - %22 = arith.addf %20, %21 : f16 - %23 = arith.maxnumf %22, %cst : f16 - memref.store %23, %alloc[%c0, %19, %13, %3] : memref<1x128x28x28xf16> + %1 = arith.divsi %arg2, %c28 : index + %2 = arith.remsi %1, %c28 : index + %3 = arith.divsi %1, %c28 : index + %4 = memref.load %arg0[%c0, %3, %2, %0] : memref<1x128x28x28xf16> + %5 = memref.load %arg1[%c0, %3, %2, %0] : memref<1x128x28x28xf16> + %6 = arith.addf %4, %5 : f16 + %7 = arith.maximumf %6, %cst : f16 + memref.store %7, %alloc[%c0, %3, %2, %0] : memref<1x128x28x28xf16> } return %alloc : memref<1x128x28x28xf16> } func.func private @Unknown30(%arg0: memref<256x128x1x1xf32>) -> memref<256x128x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %c32768 = arith.constant 32768 : index - %c1 = arith.constant 1 : index - %c128 = arith.constant 128 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<256x128x1x1xf16> scf.for %arg1 = %c0 to %c32768 step %c1 { %0 = arith.remsi %arg1, %c128 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c128 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c128 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = memref.load %arg0[%9, %3, %c0, %c0] : memref<256x128x1x1xf32> - %11 = arith.truncf %10 : f32 to f16 - memref.store %11, %alloc[%9, %3, %c0, %c0] : memref<256x128x1x1xf16> + %1 = arith.divsi %arg1, %c128 : index + %2 = memref.load %arg0[%1, %0, %c0, %c0] : memref<256x128x1x1xf32> + %3 = arith.truncf %2 : f32 to f16 + memref.store %3, %alloc[%1, %0, %c0, %c0] : memref<256x128x1x1xf16> } return %alloc : memref<256x128x1x1xf16> } func.func private @Unknown32(%arg0: memref<256x128x3x3xf32>) -> memref<256x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c294912 = arith.constant 294912 : index - %c1 = arith.constant 1 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c294912 = arith.constant 294912 : index %alloc = memref.alloc() : memref<256x128x3x3xf16> scf.for %arg1 = %c0 to %c294912 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 
: index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c128 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c128 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c128 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<256x128x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<256x128x3x3xf16> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c128 : index + %5 = arith.divsi %3, %c128 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<256x128x3x3xf32> + %7 = arith.truncf %6 : f32 to f16 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<256x128x3x3xf16> } return %alloc : memref<256x128x3x3xf16> } func.func private @Unknown34(%arg0: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 + %c14 = arith.constant 14 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 %c50176 = arith.constant 50176 : index - %c1 = arith.constant 1 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<1x256x14x14xf16> scf.for %arg1 = %c0 to %c50176 step %c1 { %0 = arith.remsi %arg1, %c14 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c14 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c14 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c14 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c14 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c14 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x256x14x14xf16> - %21 = arith.maxnumf %20, %cst : f16 - memref.store %21, %alloc[%c0, %19, %13, %3] : memref<1x256x14x14xf16> + %1 = arith.divsi %arg1, %c14 : index + %2 = arith.remsi %1, %c14 : index + %3 = arith.divsi %1, %c14 : index + %4 = memref.load %arg0[%c0, %3, %2, %0] : memref<1x256x14x14xf16> + %5 = arith.maximumf %4, %cst : f16 + memref.store %5, %alloc[%c0, %3, %2, %0] : memref<1x256x14x14xf16> } return %alloc : memref<1x256x14x14xf16> } func.func private @Unknown35(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c1 = arith.constant 1 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %alloc = memref.alloc() : memref<256x256x3x3xf16> - scf.for %arg1 = %c0 to %c589824 step %c1 { - %0 = 
arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c256 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c256 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c256 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<256x256x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<256x256x3x3xf16> - } - return %alloc : memref<256x256x3x3xf16> - } - func.func private @Unknown37(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index %c1 = arith.constant 1 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %alloc = memref.alloc() : memref<1x256x14x14xf16> - scf.for %arg2 = %c0 to %c50176 step %c1 { - %0 = arith.remsi %arg2, %c14 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c14 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c14 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c14 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c14 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c14 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x256x14x14xf16> - %21 = memref.load %arg1[%c0, %19, %13, %3] : memref<1x256x14x14xf16> - %22 = arith.addf %20, %21 : f16 - %23 = arith.maxnumf %22, %cst : f16 - memref.store %23, %alloc[%c0, %19, %13, %3] : memref<1x256x14x14xf16> - } - return %alloc : memref<1x256x14x14xf16> - } - func.func private @Unknown38(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c256 = arith.constant 256 : index %c0 = arith.constant 0 : index %c589824 = arith.constant 589824 : index - %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<256x256x3x3xf16> scf.for %arg1 = %c0 to %c589824 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = 
arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c256 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c256 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c256 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<256x256x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<256x256x3x3xf16> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c256 : index + %5 = arith.divsi %3, %c256 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<256x256x3x3xf32> + %7 = arith.truncf %6 : f32 to f16 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<256x256x3x3xf16> } return %alloc : memref<256x256x3x3xf16> } - func.func private @Unknown40(%arg0: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index - %c1 = arith.constant 1 : index + func.func private @Unknown37(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %alloc = memref.alloc() : memref<1x256x14x14xf16> - scf.for %arg1 = %c0 to %c50176 step %c1 { - %0 = arith.remsi %arg1, %c14 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c14 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c14 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c14 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c14 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c14 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x256x14x14xf16> - %21 = arith.maxnumf %20, %cst : f16 - memref.store %21, %alloc[%c0, %19, %13, %3] : memref<1x256x14x14xf16> - } - return %alloc : memref<1x256x14x14xf16> - } - func.func private @Unknown41(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index %c1 = arith.constant 1 : index - %c3 = 
arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %alloc = memref.alloc() : memref<256x256x3x3xf16> - scf.for %arg1 = %c0 to %c589824 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c256 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c256 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c256 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<256x256x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<256x256x3x3xf16> - } - return %alloc : memref<256x256x3x3xf16> - } - func.func private @Unknown43(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 %c50176 = arith.constant 50176 : index - %c1 = arith.constant 1 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<1x256x14x14xf16> scf.for %arg2 = %c0 to %c50176 step %c1 { %0 = arith.remsi %arg2, %c14 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c14 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c14 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c14 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c14 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c14 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x256x14x14xf16> - %21 = memref.load %arg1[%c0, %19, %13, %3] : memref<1x256x14x14xf16> - %22 = arith.addf %20, %21 : f16 - %23 = arith.maxnumf %22, %cst : f16 - memref.store %23, %alloc[%c0, %19, %13, %3] : memref<1x256x14x14xf16> + %1 = arith.divsi %arg2, %c14 : index + %2 = arith.remsi %1, %c14 : index + %3 = arith.divsi %1, %c14 : index + %4 = memref.load %arg0[%c0, %3, %2, %0] : memref<1x256x14x14xf16> + %5 = memref.load %arg1[%c0, %3, %2, %0] : memref<1x256x14x14xf16> + %6 = arith.addf %4, %5 : f16 + %7 = arith.maximumf %6, %cst : f16 + memref.store %7, %alloc[%c0, %3, %2, %0] : 
memref<1x256x14x14xf16> } return %alloc : memref<1x256x14x14xf16> } func.func private @Unknown44(%arg0: memref<512x256x1x1xf32>) -> memref<512x256x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %c131072 = arith.constant 131072 : index - %c1 = arith.constant 1 : index - %c256 = arith.constant 256 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<512x256x1x1xf16> scf.for %arg1 = %c0 to %c131072 step %c1 { %0 = arith.remsi %arg1, %c256 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c256 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c256 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = memref.load %arg0[%9, %3, %c0, %c0] : memref<512x256x1x1xf32> - %11 = arith.truncf %10 : f32 to f16 - memref.store %11, %alloc[%9, %3, %c0, %c0] : memref<512x256x1x1xf16> + %1 = arith.divsi %arg1, %c256 : index + %2 = memref.load %arg0[%1, %0, %c0, %c0] : memref<512x256x1x1xf32> + %3 = arith.truncf %2 : f32 to f16 + memref.store %3, %alloc[%1, %0, %c0, %c0] : memref<512x256x1x1xf16> } return %alloc : memref<512x256x1x1xf16> } func.func private @Unknown46(%arg0: memref<512x256x3x3xf32>) -> memref<512x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c1179648 = arith.constant 1179648 : index - %c1 = arith.constant 1 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c1179648 = arith.constant 1179648 : index %alloc = memref.alloc() : memref<512x256x3x3xf16> scf.for %arg1 = %c0 to %c1179648 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c256 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c256 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c256 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<512x256x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<512x256x3x3xf16> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c256 : index + %5 = arith.divsi %3, %c256 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<512x256x3x3xf32> + %7 = arith.truncf %6 : f32 to f16 + 
memref.store %7, %alloc[%5, %4, %2, %0] : memref<512x256x3x3xf16> } return %alloc : memref<512x256x3x3xf16> } func.func private @Unknown48(%arg0: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 + %c7 = arith.constant 7 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 %c25088 = arith.constant 25088 : index - %c1 = arith.constant 1 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<1x512x7x7xf16> scf.for %arg1 = %c0 to %c25088 step %c1 { %0 = arith.remsi %arg1, %c7 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c7 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c7 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c7 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c7 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c7 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x512x7x7xf16> - %21 = arith.maxnumf %20, %cst : f16 - memref.store %21, %alloc[%c0, %19, %13, %3] : memref<1x512x7x7xf16> + %1 = arith.divsi %arg1, %c7 : index + %2 = arith.remsi %1, %c7 : index + %3 = arith.divsi %1, %c7 : index + %4 = memref.load %arg0[%c0, %3, %2, %0] : memref<1x512x7x7xf16> + %5 = arith.maximumf %4, %cst : f16 + memref.store %5, %alloc[%c0, %3, %2, %0] : memref<1x512x7x7xf16> } return %alloc : memref<1x512x7x7xf16> } func.func private @Unknown49(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c1 = arith.constant 1 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + %c1 = arith.constant 1 : index %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index + %c2359296 = arith.constant 2359296 : index %alloc = memref.alloc() : memref<512x512x3x3xf16> scf.for %arg1 = %c0 to %c2359296 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c512 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c512 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c512 : 
index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<512x512x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<512x512x3x3xf16> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c512 : index + %5 = arith.divsi %3, %c512 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<512x512x3x3xf32> + %7 = arith.truncf %6 : f32 to f16 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<512x512x3x3xf16> } return %alloc : memref<512x512x3x3xf16> } func.func private @Unknown51(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 + %c7 = arith.constant 7 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 %c25088 = arith.constant 25088 : index - %c1 = arith.constant 1 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<1x512x7x7xf16> scf.for %arg2 = %c0 to %c25088 step %c1 { %0 = arith.remsi %arg2, %c7 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c7 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c7 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c7 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c7 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c7 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x512x7x7xf16> - %21 = memref.load %arg1[%c0, %19, %13, %3] : memref<1x512x7x7xf16> - %22 = arith.addf %20, %21 : f16 - %23 = arith.maxnumf %22, %cst : f16 - memref.store %23, %alloc[%c0, %19, %13, %3] : memref<1x512x7x7xf16> + %1 = arith.divsi %arg2, %c7 : index + %2 = arith.remsi %1, %c7 : index + %3 = arith.divsi %1, %c7 : index + %4 = memref.load %arg0[%c0, %3, %2, %0] : memref<1x512x7x7xf16> + %5 = memref.load %arg1[%c0, %3, %2, %0] : memref<1x512x7x7xf16> + %6 = arith.addf %4, %5 : f16 + %7 = arith.maximumf %6, %cst : f16 + memref.store %7, %alloc[%c0, %3, %2, %0] : memref<1x512x7x7xf16> } return %alloc : memref<1x512x7x7xf16> } - func.func private @Unknown52(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown58(%arg0: memref<1x512x7x7xf16>) -> memref<1x512xf16> attributes {__byteir_reduction_fusion__} { %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %alloc = memref.alloc() : memref<512x512x3x3xf16> - scf.for %arg1 = %c0 to %c2359296 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = 
arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c512 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c512 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c512 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<512x512x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<512x512x3x3xf16> - } - return %alloc : memref<512x512x3x3xf16> - } - func.func private @Unknown54(%arg0: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c64 = arith.constant 64 : index + %c49 = arith.constant 49 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %collapse_shape = memref.collapse_shape %arg0 [[0, 1], [2, 3]] : memref<1x512x7x7xf16> into memref<512x49xf16> + %alloc = memref.alloc() : memref<512xf16> + scf.forall (%arg1) in (512) { + %subview = memref.subview %collapse_shape[%arg1, 0] [1, 49] [1, 1] : memref<512x49xf16> to memref<49xf16, strided<[1], offset: ?>> + %expand_shape_0 = memref.expand_shape %subview [[0, 1]] : memref<49xf16, strided<[1], offset: ?>> into memref<1x49xf16, strided<[49, 1], offset: ?>> + %alloca = memref.alloca() : memref<64xf16, #gpu.address_space> + scf.forall (%arg2) in (64) { + %0 = arith.remsi %arg2, %c64 : index + %1 = arith.cmpi slt, %0, %c0 : index + %2 = arith.addi %0, %c64 : index + %3 = arith.select %1, %2, %0 : index + %4 = arith.cmpi slt, %3, %c49 : index + %5 = arith.select %4, %3, %c49 : index + %6 = arith.addi %3, %c1 : index + %7 = arith.cmpi slt, %6, %c49 : index + %8 = arith.select %7, %6, %c49 : index + %9 = arith.subi %8, %5 : index + %subview_6 = memref.subview %expand_shape_0[0, %5] [1, %9] [1, 1] : memref<1x49xf16, strided<[49, 1], offset: ?>> to memref> + %expand_shape_7 = memref.expand_shape %subview_6 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %10 = arith.cmpi ugt, %9, %c0 : index + %11 = scf.if %10 -> (f16) { + %13 = memref.load %expand_shape_7[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %13 : f16 + } else { + scf.yield %cst : f16 + } + %12 = arith.addf %11, %cst : f16 + memref.store %12, %alloca[%arg2] : memref<64xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_1 = memref.alloca() : memref<32xf16, #gpu.address_space> + scf.forall (%arg2) in (32) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca[%0] : memref<64xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca[%3] : memref<64xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_1[%arg2] : memref<32xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_2 = memref.alloca() : memref<16xf16, #gpu.address_space> + scf.forall (%arg2) in 
(16) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_1[%0] : memref<32xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_1[%3] : memref<32xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_2[%arg2] : memref<16xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_3 = memref.alloca() : memref<8xf16, #gpu.address_space> + scf.forall (%arg2) in (8) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_2[%0] : memref<16xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_2[%3] : memref<16xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_3[%arg2] : memref<8xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_4 = memref.alloca() : memref<4xf16, #gpu.address_space> + scf.forall (%arg2) in (4) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_3[%0] : memref<8xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_3[%3] : memref<8xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_4[%arg2] : memref<4xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_5 = memref.alloca() : memref<2xf16, #gpu.address_space> + scf.forall (%arg2) in (2) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_4[%0] : memref<4xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_4[%3] : memref<4xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_5[%arg2] : memref<2xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + scf.forall (%arg2) in (1) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_5[%0] : memref<2xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_5[%3] : memref<2xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloc[%arg1] : memref<512xf16> + } {mapping = [#gpu.thread]} + } {mapping = [#gpu.block]} + %expand_shape = memref.expand_shape %alloc [[0, 1]] : memref<512xf16> into memref<1x512xf16> + return %expand_shape : memref<1x512xf16> + } + func.func private @Unknown59(%arg0: memref<1x512xf16>) -> memref<1x512xf16> attributes {__byteir_elementwise_fusion__} { + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index - %c1 = arith.constant 1 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %alloc = memref.alloc() : memref<1x512x7x7xf16> - scf.for %arg1 = %c0 to %c25088 step %c1 { - %0 = arith.remsi %arg1, %c7 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c7 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c7 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c7 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c7 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c7 : index - %18 = arith.subi %c-1, %17 : index - %19 = 
arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x512x7x7xf16> - %21 = arith.maxnumf %20, %cst : f16 - memref.store %21, %alloc[%c0, %19, %13, %3] : memref<1x512x7x7xf16> - } - return %alloc : memref<1x512x7x7xf16> - } - func.func private @Unknown55(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %alloc = memref.alloc() : memref<512x512x3x3xf16> - scf.for %arg1 = %c0 to %c2359296 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c512 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c512 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c512 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<512x512x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<512x512x3x3xf16> - } - return %alloc : memref<512x512x3x3xf16> - } - func.func private @Unknown57(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index - %c1 = arith.constant 1 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %alloc = memref.alloc() : memref<1x512x7x7xf16> - scf.for %arg2 = %c0 to %c25088 step %c1 { - %0 = arith.remsi %arg2, %c7 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c7 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c7 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c7 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c7 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c7 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = memref.load %arg0[%c0, %19, %13, %3] : memref<1x512x7x7xf16> - %21 = memref.load %arg1[%c0, %19, %13, %3] : memref<1x512x7x7xf16> - %22 = arith.addf %20, %21 : f16 - %23 = arith.maxnumf 
%22, %cst : f16 - memref.store %23, %alloc[%c0, %19, %13, %3] : memref<1x512x7x7xf16> - } - return %alloc : memref<1x512x7x7xf16> - } - func.func private @Unknown58(%arg0: memref<1x512xf16>) -> memref<1x512xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 2.040100e-02 : f16 - %c0 = arith.constant 0 : index - %c512 = arith.constant 512 : index - %c1 = arith.constant 1 : index %alloc = memref.alloc() : memref<1x512xf16> scf.for %arg1 = %c0 to %c512 step %c1 { %0 = memref.load %arg0[%c0, %arg1] : memref<1x512xf16> @@ -1498,719 +507,98 @@ module { } return %alloc : memref<1x512xf16> } - func.func private @Unknown59(%arg0: memref<1000x512xf32>) -> memref<1000x512xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown60(%arg0: memref<1000x512xf32>) -> memref<1000x512xf16> attributes {__byteir_elementwise_fusion__} { + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %c512000 = arith.constant 512000 : index - %c1 = arith.constant 1 : index - %c512 = arith.constant 512 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<1000x512xf16> scf.for %arg1 = %c0 to %c512000 step %c1 { %0 = arith.remsi %arg1, %c512 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c512 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c512 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = memref.load %arg0[%9, %3] : memref<1000x512xf32> - %11 = arith.truncf %10 : f32 to f16 - memref.store %11, %alloc[%9, %3] : memref<1000x512xf16> + %1 = arith.divsi %arg1, %c512 : index + %2 = memref.load %arg0[%1, %0] : memref<1000x512xf32> + %3 = arith.truncf %2 : f32 to f16 + memref.store %3, %alloc[%1, %0] : memref<1000x512xf16> } return %alloc : memref<1000x512xf16> } - func.func private @Unknown60(%arg0: memref<1000xf32>, %arg1: memref<1x1000xf16>) -> memref<1x1000xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index + func.func private @Unknown61(%arg0: memref<1000xf32>, %arg1: memref<1x1000xf16>) -> memref<1x1000xf16> attributes {__byteir_elementwise_fusion__} { %c1000 = arith.constant 1000 : index %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %alloc = memref.alloc() : memref<1x1000xf16> scf.for %arg2 = %c0 to %c1000 step %c1 { - %0 = memref.load %arg1[%c0, %arg2] : memref<1x1000xf16> - %1 = memref.load %arg0[%arg2] : memref<1000xf32> - %2 = arith.truncf %1 : f32 to f16 - %3 = arith.addf %0, %2 : f16 + %0 = memref.load %arg0[%arg2] : memref<1000xf32> + %1 = memref.load %arg1[%c0, %arg2] : memref<1x1000xf16> + %2 = arith.truncf %0 : f32 to f16 + %3 = arith.addf %1, %2 : f16 memref.store %3, %alloc[%c0, %arg2] : memref<1x1000xf16> } return %alloc : memref<1x1000xf16> } - func.func private @Unknown61(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<64xf32> - scf.for %arg2 = %c0 to %c64 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<64xf32> - %1 = memref.load %arg1[%arg2] : memref<64xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - 
memref.store %4, %alloc[%arg2] : memref<64xf32> - } - return %alloc : memref<64xf32> - } func.func private @Unknown62(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<64xf32> - scf.for %arg2 = %c0 to %c64 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<64xf32> - %1 = memref.load %arg1[%arg2] : memref<64xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<64xf32> - } - return %alloc : memref<64xf32> - } - func.func private @Unknown63(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<64xf32> - scf.for %arg2 = %c0 to %c64 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<64xf32> - %1 = memref.load %arg1[%arg2] : memref<64xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<64xf32> - } - return %alloc : memref<64xf32> - } - func.func private @Unknown64(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<64xf32> - scf.for %arg2 = %c0 to %c64 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<64xf32> - %1 = memref.load %arg1[%arg2] : memref<64xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<64xf32> - } - return %alloc : memref<64xf32> - } - func.func private @Unknown65(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<64xf32> - scf.for %arg2 = %c0 to %c64 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<64xf32> - %1 = memref.load %arg1[%arg2] : memref<64xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<64xf32> - } - return %alloc : memref<64xf32> - } - func.func private @Unknown66(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<64xf32> - scf.for %arg2 = %c0 to %c64 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<64xf32> - %1 = memref.load %arg1[%arg2] : memref<64xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<64xf32> - } - 
return %alloc : memref<64xf32> - } - func.func private @Unknown67(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<64xf32> - scf.for %arg2 = %c0 to %c64 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<64xf32> - %1 = memref.load %arg1[%arg2] : memref<64xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<64xf32> - } - return %alloc : memref<64xf32> - } - func.func private @Unknown68(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index %c64 = arith.constant 64 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<64xf32> - scf.for %arg2 = %c0 to %c64 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<64xf32> - %1 = memref.load %arg1[%arg2] : memref<64xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<64xf32> - } - return %alloc : memref<64xf32> - } - func.func private @Unknown69(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<64xf32> - scf.for %arg2 = %c0 to %c64 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<64xf32> - %1 = memref.load %arg1[%arg2] : memref<64xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<64xf32> - } - return %alloc : memref<64xf32> - } - func.func private @Unknown70(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.899999976 : f32 %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c1 = arith.constant 1 : index %alloc = memref.alloc() : memref<64xf32> scf.for %arg2 = %c0 to %c64 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<64xf32> - %1 = memref.load %arg1[%arg2] : memref<64xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 + %0 = memref.load %arg1[%arg2] : memref<64xf32> + %1 = memref.load %arg0[%arg2] : memref<64xf32> + %2 = arith.mulf %0, %cst : f32 + %3 = arith.mulf %1, %cst_0 : f32 %4 = arith.addf %3, %2 : f32 memref.store %4, %alloc[%arg2] : memref<64xf32> } return %alloc : memref<64xf32> } - func.func private @Unknown71(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<128xf32> - scf.for %arg2 = %c0 to %c128 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<128xf32> - %1 = memref.load %arg1[%arg2] : memref<128xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, 
%cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<128xf32> - } - return %alloc : memref<128xf32> - } func.func private @Unknown72(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<128xf32> - scf.for %arg2 = %c0 to %c128 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<128xf32> - %1 = memref.load %arg1[%arg2] : memref<128xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<128xf32> - } - return %alloc : memref<128xf32> - } - func.func private @Unknown73(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<128xf32> - scf.for %arg2 = %c0 to %c128 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<128xf32> - %1 = memref.load %arg1[%arg2] : memref<128xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<128xf32> - } - return %alloc : memref<128xf32> - } - func.func private @Unknown74(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c128 = arith.constant 128 : index %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<128xf32> - scf.for %arg2 = %c0 to %c128 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<128xf32> - %1 = memref.load %arg1[%arg2] : memref<128xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<128xf32> - } - return %alloc : memref<128xf32> - } - func.func private @Unknown75(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<128xf32> - scf.for %arg2 = %c0 to %c128 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<128xf32> - %1 = memref.load %arg1[%arg2] : memref<128xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<128xf32> - } - return %alloc : memref<128xf32> - } - func.func private @Unknown76(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<128xf32> - scf.for %arg2 = %c0 to %c128 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<128xf32> - %1 = memref.load %arg1[%arg2] : memref<128xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, 
%cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<128xf32> - } - return %alloc : memref<128xf32> - } - func.func private @Unknown77(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<128xf32> - scf.for %arg2 = %c0 to %c128 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<128xf32> - %1 = memref.load %arg1[%arg2] : memref<128xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<128xf32> - } - return %alloc : memref<128xf32> - } - func.func private @Unknown78(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<128xf32> - scf.for %arg2 = %c0 to %c128 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<128xf32> - %1 = memref.load %arg1[%arg2] : memref<128xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<128xf32> - } - return %alloc : memref<128xf32> - } - func.func private @Unknown79(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 %c0 = arith.constant 0 : index - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<128xf32> - scf.for %arg2 = %c0 to %c128 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<128xf32> - %1 = memref.load %arg1[%arg2] : memref<128xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<128xf32> - } - return %alloc : memref<128xf32> - } - func.func private @Unknown80(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.899999976 : f32 %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index %alloc = memref.alloc() : memref<128xf32> scf.for %arg2 = %c0 to %c128 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<128xf32> - %1 = memref.load %arg1[%arg2] : memref<128xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 + %0 = memref.load %arg1[%arg2] : memref<128xf32> + %1 = memref.load %arg0[%arg2] : memref<128xf32> + %2 = arith.mulf %0, %cst : f32 + %3 = arith.mulf %1, %cst_0 : f32 %4 = arith.addf %3, %2 : f32 memref.store %4, %alloc[%arg2] : memref<128xf32> } return %alloc : memref<128xf32> } - func.func private @Unknown81(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c256 = arith.constant 256 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<256xf32> - scf.for %arg2 = %c0 to %c256 step %c1 { - %0 = 
memref.load %arg0[%arg2] : memref<256xf32> - %1 = memref.load %arg1[%arg2] : memref<256xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<256xf32> - } - return %alloc : memref<256xf32> - } func.func private @Unknown82(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c256 = arith.constant 256 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<256xf32> - scf.for %arg2 = %c0 to %c256 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<256xf32> - %1 = memref.load %arg1[%arg2] : memref<256xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<256xf32> - } - return %alloc : memref<256xf32> - } - func.func private @Unknown83(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c256 = arith.constant 256 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<256xf32> - scf.for %arg2 = %c0 to %c256 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<256xf32> - %1 = memref.load %arg1[%arg2] : memref<256xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<256xf32> - } - return %alloc : memref<256xf32> - } - func.func private @Unknown84(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c256 = arith.constant 256 : index %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<256xf32> - scf.for %arg2 = %c0 to %c256 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<256xf32> - %1 = memref.load %arg1[%arg2] : memref<256xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<256xf32> - } - return %alloc : memref<256xf32> - } - func.func private @Unknown85(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index %c256 = arith.constant 256 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<256xf32> - scf.for %arg2 = %c0 to %c256 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<256xf32> - %1 = memref.load %arg1[%arg2] : memref<256xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<256xf32> - } - return %alloc : memref<256xf32> - } - func.func private @Unknown86(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 %c0 = arith.constant 0 : index - %c256 = arith.constant 256 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<256xf32> - scf.for %arg2 = %c0 to %c256 step %c1 { - %0 = 
memref.load %arg0[%arg2] : memref<256xf32> - %1 = memref.load %arg1[%arg2] : memref<256xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<256xf32> - } - return %alloc : memref<256xf32> - } - func.func private @Unknown87(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c256 = arith.constant 256 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<256xf32> - scf.for %arg2 = %c0 to %c256 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<256xf32> - %1 = memref.load %arg1[%arg2] : memref<256xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<256xf32> - } - return %alloc : memref<256xf32> - } - func.func private @Unknown88(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c256 = arith.constant 256 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<256xf32> - scf.for %arg2 = %c0 to %c256 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<256xf32> - %1 = memref.load %arg1[%arg2] : memref<256xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<256xf32> - } - return %alloc : memref<256xf32> - } - func.func private @Unknown89(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c256 = arith.constant 256 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<256xf32> - scf.for %arg2 = %c0 to %c256 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<256xf32> - %1 = memref.load %arg1[%arg2] : memref<256xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<256xf32> - } - return %alloc : memref<256xf32> - } - func.func private @Unknown90(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.899999976 : f32 %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c256 = arith.constant 256 : index - %c1 = arith.constant 1 : index %alloc = memref.alloc() : memref<256xf32> scf.for %arg2 = %c0 to %c256 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<256xf32> - %1 = memref.load %arg1[%arg2] : memref<256xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 + %0 = memref.load %arg1[%arg2] : memref<256xf32> + %1 = memref.load %arg0[%arg2] : memref<256xf32> + %2 = arith.mulf %0, %cst : f32 + %3 = arith.mulf %1, %cst_0 : f32 %4 = arith.addf %3, %2 : f32 memref.store %4, %alloc[%arg2] : memref<256xf32> } return %alloc : memref<256xf32> } - func.func private @Unknown91(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c512 = 
arith.constant 512 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<512xf32> - scf.for %arg2 = %c0 to %c512 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<512xf32> - %1 = memref.load %arg1[%arg2] : memref<512xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<512xf32> - } - return %alloc : memref<512xf32> - } func.func private @Unknown92(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c512 = arith.constant 512 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<512xf32> - scf.for %arg2 = %c0 to %c512 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<512xf32> - %1 = memref.load %arg1[%arg2] : memref<512xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<512xf32> - } - return %alloc : memref<512xf32> - } - func.func private @Unknown93(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c512 = arith.constant 512 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<512xf32> - scf.for %arg2 = %c0 to %c512 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<512xf32> - %1 = memref.load %arg1[%arg2] : memref<512xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<512xf32> - } - return %alloc : memref<512xf32> - } - func.func private @Unknown94(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c512 = arith.constant 512 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<512xf32> - scf.for %arg2 = %c0 to %c512 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<512xf32> - %1 = memref.load %arg1[%arg2] : memref<512xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<512xf32> - } - return %alloc : memref<512xf32> - } - func.func private @Unknown95(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c512 = arith.constant 512 : index %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<512xf32> - scf.for %arg2 = %c0 to %c512 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<512xf32> - %1 = memref.load %arg1[%arg2] : memref<512xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<512xf32> - } - return %alloc : memref<512xf32> - } - func.func private @Unknown96(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index %c512 = 
arith.constant 512 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<512xf32> - scf.for %arg2 = %c0 to %c512 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<512xf32> - %1 = memref.load %arg1[%arg2] : memref<512xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<512xf32> - } - return %alloc : memref<512xf32> - } - func.func private @Unknown97(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 %c0 = arith.constant 0 : index - %c512 = arith.constant 512 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<512xf32> - scf.for %arg2 = %c0 to %c512 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<512xf32> - %1 = memref.load %arg1[%arg2] : memref<512xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<512xf32> - } - return %alloc : memref<512xf32> - } - func.func private @Unknown98(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.899999976 : f32 %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c512 = arith.constant 512 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<512xf32> - scf.for %arg2 = %c0 to %c512 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<512xf32> - %1 = memref.load %arg1[%arg2] : memref<512xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<512xf32> - } - return %alloc : memref<512xf32> - } - func.func private @Unknown99(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c512 = arith.constant 512 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<512xf32> - scf.for %arg2 = %c0 to %c512 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<512xf32> - %1 = memref.load %arg1[%arg2] : memref<512xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 - %4 = arith.addf %3, %2 : f32 - memref.store %4, %alloc[%arg2] : memref<512xf32> - } - return %alloc : memref<512xf32> - } - func.func private @Unknown100(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c0 = arith.constant 0 : index - %c512 = arith.constant 512 : index - %c1 = arith.constant 1 : index %alloc = memref.alloc() : memref<512xf32> scf.for %arg2 = %c0 to %c512 step %c1 { - %0 = memref.load %arg0[%arg2] : memref<512xf32> - %1 = memref.load %arg1[%arg2] : memref<512xf32> - %2 = arith.mulf %1, %cst : f32 - %3 = arith.mulf %0, %cst_0 : f32 + %0 = memref.load %arg1[%arg2] : memref<512xf32> + %1 = memref.load %arg0[%arg2] : memref<512xf32> + %2 = arith.mulf %0, %cst : f32 + %3 = arith.mulf %1, %cst_0 : f32 %4 = arith.addf %3, %2 : f32 memref.store %4, %alloc[%arg2] : memref<512xf32> } @@ -2236,7 +624,7 @@ module { %alloc_7 = memref.alloc() : memref<64xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_4, %arg6, %arg5, 
%alloc_5, %alloc_6, %alloc_7) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32> %4 = call @Unknown6(%alloc_5) : (memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> - %5 = call @Unknown7(%arg10) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> + %5 = call @Unknown4(%arg10) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> %alloc_8 = memref.alloc() : memref<1x64x56x56xf16> byre.compute @ConvOp_f16f16_f16(%4, %5, %alloc_8) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16> %alloc_9 = memref.alloc() : memref<1x64x56x56xf16> @@ -2244,22 +632,22 @@ module { %alloc_11 = memref.alloc() : memref<64xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_8, %arg8, %arg7, %alloc_9, %alloc_10, %alloc_11) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32> %6 = call @Unknown9(%alloc_9, %alloc_3) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> - %7 = call @Unknown10(%arg15) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> + %7 = call @Unknown4(%arg15) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> %alloc_12 = memref.alloc() : memref<1x64x56x56xf16> byre.compute @ConvOp_f16f16_f16(%6, %7, %alloc_12) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16> %alloc_13 = memref.alloc() : memref<1x64x56x56xf16> %alloc_14 = memref.alloc() : memref<64xf32> %alloc_15 = memref.alloc() : memref<64xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_12, %arg12, %arg11, %alloc_13, %alloc_14, %alloc_15) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32> - %8 = call @Unknown12(%alloc_13) : (memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> - %9 = call @Unknown13(%arg16) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> + %8 = call @Unknown6(%alloc_13) : (memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> + %9 = call @Unknown4(%arg16) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> %alloc_16 = memref.alloc() : memref<1x64x56x56xf16> byre.compute @ConvOp_f16f16_f16(%8, %9, %alloc_16) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, 
window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16> %alloc_17 = memref.alloc() : memref<1x64x56x56xf16> %alloc_18 = memref.alloc() : memref<64xf32> %alloc_19 = memref.alloc() : memref<64xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_16, %arg14, %arg13, %alloc_17, %alloc_18, %alloc_19) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32> - %10 = call @Unknown15(%alloc_17, %6) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> + %10 = call @Unknown9(%alloc_17, %6) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> %11 = call @Unknown16(%arg23) : (memref<128x64x1x1xf32>) -> memref<128x64x1x1xf16> %alloc_20 = memref.alloc() : memref<1x128x28x28xf16> byre.compute @ConvOp_f16f16_f16(%10, %11, %alloc_20) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<128x64x1x1xf16>, memref<1x128x28x28xf16> @@ -2283,22 +671,22 @@ module { %alloc_31 = memref.alloc() : memref<128xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_28, %arg20, %arg19, %alloc_29, %alloc_30, %alloc_31) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<1x128x28x28xf16>, memref<128xf32>, memref<128xf32> %15 = call @Unknown23(%alloc_29, %alloc_21) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> - %16 = call @Unknown24(%arg30) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> + %16 = call @Unknown21(%arg30) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> %alloc_32 = memref.alloc() : memref<1x128x28x28xf16> byre.compute @ConvOp_f16f16_f16(%15, %16, %alloc_32) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16> %alloc_33 = memref.alloc() : memref<1x128x28x28xf16> %alloc_34 = memref.alloc() : memref<128xf32> %alloc_35 = memref.alloc() : memref<128xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_32, %arg27, %arg26, %alloc_33, %alloc_34, %alloc_35) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<1x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %17 = call @Unknown26(%alloc_33) : (memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> - %18 = call @Unknown27(%arg31) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> + %17 = call @Unknown20(%alloc_33) : (memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> + %18 = call @Unknown21(%arg31) : (memref<128x128x3x3xf32>) -> 
memref<128x128x3x3xf16> %alloc_36 = memref.alloc() : memref<1x128x28x28xf16> byre.compute @ConvOp_f16f16_f16(%17, %18, %alloc_36) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16> %alloc_37 = memref.alloc() : memref<1x128x28x28xf16> %alloc_38 = memref.alloc() : memref<128xf32> %alloc_39 = memref.alloc() : memref<128xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_36, %arg29, %arg28, %alloc_37, %alloc_38, %alloc_39) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<1x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %19 = call @Unknown29(%alloc_37, %15) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> + %19 = call @Unknown23(%alloc_37, %15) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> %20 = call @Unknown30(%arg38) : (memref<256x128x1x1xf32>) -> memref<256x128x1x1xf16> %alloc_40 = memref.alloc() : memref<1x256x14x14xf16> byre.compute @ConvOp_f16f16_f16(%19, %20, %alloc_40) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<256x128x1x1xf16>, memref<1x256x14x14xf16> @@ -2322,22 +710,22 @@ module { %alloc_51 = memref.alloc() : memref<256xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_48, %arg35, %arg34, %alloc_49, %alloc_50, %alloc_51) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<1x256x14x14xf16>, memref<256xf32>, memref<256xf32> %24 = call @Unknown37(%alloc_49, %alloc_41) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> - %25 = call @Unknown38(%arg45) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> + %25 = call @Unknown35(%arg45) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> %alloc_52 = memref.alloc() : memref<1x256x14x14xf16> byre.compute @ConvOp_f16f16_f16(%24, %25, %alloc_52) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16> %alloc_53 = memref.alloc() : memref<1x256x14x14xf16> %alloc_54 = memref.alloc() : memref<256xf32> %alloc_55 = memref.alloc() : memref<256xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_52, %arg42, %arg41, %alloc_53, %alloc_54, %alloc_55) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : 
memref<1x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<1x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %26 = call @Unknown40(%alloc_53) : (memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> - %27 = call @Unknown41(%arg46) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> + %26 = call @Unknown34(%alloc_53) : (memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> + %27 = call @Unknown35(%arg46) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> %alloc_56 = memref.alloc() : memref<1x256x14x14xf16> byre.compute @ConvOp_f16f16_f16(%26, %27, %alloc_56) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16> %alloc_57 = memref.alloc() : memref<1x256x14x14xf16> %alloc_58 = memref.alloc() : memref<256xf32> %alloc_59 = memref.alloc() : memref<256xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_56, %arg44, %arg43, %alloc_57, %alloc_58, %alloc_59) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<1x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %28 = call @Unknown43(%alloc_57, %24) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> + %28 = call @Unknown37(%alloc_57, %24) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> %29 = call @Unknown44(%arg53) : (memref<512x256x1x1xf32>) -> memref<512x256x1x1xf16> %alloc_60 = memref.alloc() : memref<1x512x7x7xf16> byre.compute @ConvOp_f16f16_f16(%28, %29, %alloc_60) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<512x256x1x1xf16>, memref<1x512x7x7xf16> @@ -2361,71 +749,70 @@ module { %alloc_71 = memref.alloc() : memref<512xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_68, %arg50, %arg49, %alloc_69, %alloc_70, %alloc_71) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<1x512x7x7xf16>, memref<512xf32>, memref<512xf32> %33 = call @Unknown51(%alloc_69, %alloc_61) : (memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> - %34 = call @Unknown52(%arg60) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> + %34 = call @Unknown49(%arg60) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> %alloc_72 = memref.alloc() : memref<1x512x7x7xf16> byre.compute @ConvOp_f16f16_f16(%33, %34, %alloc_72) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16>, 
memref<512x512x3x3xf16>, memref<1x512x7x7xf16> %alloc_73 = memref.alloc() : memref<1x512x7x7xf16> %alloc_74 = memref.alloc() : memref<512xf32> %alloc_75 = memref.alloc() : memref<512xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_72, %arg57, %arg56, %alloc_73, %alloc_74, %alloc_75) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<1x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %35 = call @Unknown54(%alloc_73) : (memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> - %36 = call @Unknown55(%arg61) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> + %35 = call @Unknown48(%alloc_73) : (memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> + %36 = call @Unknown49(%arg61) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> %alloc_76 = memref.alloc() : memref<1x512x7x7xf16> byre.compute @ConvOp_f16f16_f16(%35, %36, %alloc_76) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16> %alloc_77 = memref.alloc() : memref<1x512x7x7xf16> %alloc_78 = memref.alloc() : memref<512xf32> %alloc_79 = memref.alloc() : memref<512xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_76, %arg59, %arg58, %alloc_77, %alloc_78, %alloc_79) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<1x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %37 = call @Unknown57(%alloc_77, %33) : (memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> - %alloc_80 = memref.alloc() : memref<1x512xf16> - byre.compute @ReduceSumOp_f16_f16(%37, %alloc_80) {dimensions = dense<[3, 2]> : tensor<2xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<1x512x7x7xf16>, memref<1x512xf16> - %38 = call @Unknown58(%alloc_80) : (memref<1x512xf16>) -> memref<1x512xf16> - %39 = call @Unknown59(%arg4) : (memref<1000x512xf32>) -> memref<1000x512xf16> - %alloc_81 = memref.alloc() : memref<512x1000xf16> - byre.compute @TransposeOp_f16_f16(%39, %alloc_81) {memory_effects = [1 : i32, 2 : i32], minor_to_major = dense<[0, 1]> : tensor<2xindex>, permutation = dense<[1, 0]> : tensor<2xi64>} : memref<1000x512xf16>, memref<512x1000xf16> - %alloc_82 = memref.alloc() : memref<1x1000xf16> - byre.compute @MatmulOp_f16f16_f16(%38, %39, %alloc_82) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<1x512xf16>, memref<1000x512xf16>, memref<1x1000xf16> - %40 = call @Unknown60(%arg3, %alloc_82) : (memref<1000xf32>, memref<1x1000xf16>) -> memref<1x1000xf16> - %41 = call @Unknown61(%alloc_1, %arg63) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %42 = call @Unknown62(%alloc_2, %arg64) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %43 = call @Unknown63(%alloc_6, %arg66) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %44 = call @Unknown64(%alloc_7, %arg67) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %45 = call @Unknown65(%alloc_10, %arg69) : (memref<64xf32>, memref<64xf32>) -> 
memref<64xf32> - %46 = call @Unknown66(%alloc_11, %arg70) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %47 = call @Unknown67(%alloc_14, %arg72) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %48 = call @Unknown68(%alloc_15, %arg73) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %49 = call @Unknown69(%alloc_18, %arg75) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %50 = call @Unknown70(%alloc_19, %arg76) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %51 = call @Unknown71(%alloc_26, %arg78) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %52 = call @Unknown72(%alloc_27, %arg79) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %53 = call @Unknown73(%alloc_30, %arg81) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %54 = call @Unknown74(%alloc_31, %arg82) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %55 = call @Unknown75(%alloc_22, %arg84) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %56 = call @Unknown76(%alloc_23, %arg85) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %57 = call @Unknown77(%alloc_34, %arg87) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %58 = call @Unknown78(%alloc_35, %arg88) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %59 = call @Unknown79(%alloc_38, %arg90) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %60 = call @Unknown80(%alloc_39, %arg91) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> - %61 = call @Unknown81(%alloc_46, %arg93) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %62 = call @Unknown82(%alloc_47, %arg94) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %63 = call @Unknown83(%alloc_50, %arg96) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %64 = call @Unknown84(%alloc_51, %arg97) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %65 = call @Unknown85(%alloc_42, %arg99) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %66 = call @Unknown86(%alloc_43, %arg100) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %67 = call @Unknown87(%alloc_54, %arg102) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %68 = call @Unknown88(%alloc_55, %arg103) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %69 = call @Unknown89(%alloc_58, %arg105) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %70 = call @Unknown90(%alloc_59, %arg106) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> - %71 = call @Unknown91(%alloc_66, %arg108) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %72 = call @Unknown92(%alloc_67, %arg109) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %73 = call @Unknown93(%alloc_70, %arg111) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %74 = call @Unknown94(%alloc_71, %arg112) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %75 = call @Unknown95(%alloc_62, %arg114) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %76 = call @Unknown96(%alloc_63, %arg115) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %77 = call @Unknown97(%alloc_74, %arg117) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %78 = call @Unknown98(%alloc_75, %arg118) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %79 = call @Unknown99(%alloc_78, %arg120) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - %80 = call @Unknown100(%alloc_79, %arg121) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> - return %40, %arg0, %arg1, %arg5, %arg6, %arg7, %arg8, %arg11, %arg12, 
%arg13, %arg14, %arg17, %arg18, %arg19, %arg20, %arg24, %arg25, %arg26, %arg27, %arg28, %arg29, %arg32, %arg33, %arg34, %arg35, %arg39, %arg40, %arg41, %arg42, %arg43, %arg44, %arg47, %arg48, %arg49, %arg50, %arg54, %arg55, %arg56, %arg57, %arg58, %arg59, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %1, %0, %alloc, %2, %alloc_3, %3, %alloc_4, %4, %5, %alloc_8, %6, %7, %alloc_12, %8, %9, %alloc_16, %10, %12, %alloc_24, %13, %14, %alloc_28, %11, %alloc_20, %15, %16, %alloc_32, %17, %18, %alloc_36, %19, %21, %alloc_44, %22, %23, %alloc_48, %20, %alloc_40, %24, %25, %alloc_52, %26, %27, %alloc_56, %28, %30, %alloc_64, %31, %32, %alloc_68, %29, %alloc_60, %33, %34, %alloc_72, %35, %36, %alloc_76, %37, %38, %alloc_81 : memref<1x1000xf16>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<64x3x7x7xf16>, memref<1x3x224x224xf16>, memref<1x64x112x112xf16>, memref<1x64x112x112xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<128x64x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<128x64x1x1xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<256x128x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<256x128x1x1xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<512x256x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<512x256x1x1xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, 
memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<1x512xf16>, memref<512x1000xf16> + %37 = call @Unknown51(%alloc_77, %33) : (memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> + %38 = call @Unknown58(%37) : (memref<1x512x7x7xf16>) -> memref<1x512xf16> + %39 = call @Unknown59(%38) : (memref<1x512xf16>) -> memref<1x512xf16> + %40 = call @Unknown60(%arg4) : (memref<1000x512xf32>) -> memref<1000x512xf16> + %alloc_80 = memref.alloc() : memref<512x1000xf16> + byre.compute @TransposeOp_f16_f16(%40, %alloc_80) {memory_effects = [1 : i32, 2 : i32], minor_to_major = dense<[0, 1]> : tensor<2xindex>, permutation = dense<[1, 0]> : tensor<2xi64>} : memref<1000x512xf16>, memref<512x1000xf16> + %alloc_81 = memref.alloc() : memref<1x1000xf16> + byre.compute @MatmulOp_f16f16_f16(%39, %40, %alloc_81) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<1x512xf16>, memref<1000x512xf16>, memref<1x1000xf16> + %41 = call @Unknown61(%arg3, %alloc_81) : (memref<1000xf32>, memref<1x1000xf16>) -> memref<1x1000xf16> + %42 = call @Unknown62(%alloc_1, %arg63) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %43 = call @Unknown62(%alloc_2, %arg64) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %44 = call @Unknown62(%alloc_6, %arg66) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %45 = call @Unknown62(%alloc_7, %arg67) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %46 = call @Unknown62(%alloc_10, %arg69) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %47 = call @Unknown62(%alloc_11, %arg70) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %48 = call @Unknown62(%alloc_14, %arg72) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %49 = call @Unknown62(%alloc_15, %arg73) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %50 = call @Unknown62(%alloc_18, %arg75) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %51 = call @Unknown62(%alloc_19, %arg76) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %52 = call @Unknown72(%alloc_26, %arg78) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %53 = call @Unknown72(%alloc_27, %arg79) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %54 = call @Unknown72(%alloc_30, %arg81) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %55 = call @Unknown72(%alloc_31, %arg82) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %56 = call @Unknown72(%alloc_22, %arg84) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %57 = call @Unknown72(%alloc_23, %arg85) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %58 = call @Unknown72(%alloc_34, %arg87) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %59 = call @Unknown72(%alloc_35, %arg88) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %60 = call @Unknown72(%alloc_38, %arg90) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %61 = call @Unknown72(%alloc_39, %arg91) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %62 = call @Unknown82(%alloc_46, %arg93) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %63 = call @Unknown82(%alloc_47, %arg94) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %64 = call @Unknown82(%alloc_50, %arg96) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %65 = call @Unknown82(%alloc_51, %arg97) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %66 = 
call @Unknown82(%alloc_42, %arg99) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %67 = call @Unknown82(%alloc_43, %arg100) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %68 = call @Unknown82(%alloc_54, %arg102) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %69 = call @Unknown82(%alloc_55, %arg103) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %70 = call @Unknown82(%alloc_58, %arg105) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %71 = call @Unknown82(%alloc_59, %arg106) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %72 = call @Unknown92(%alloc_66, %arg108) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %73 = call @Unknown92(%alloc_67, %arg109) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %74 = call @Unknown92(%alloc_70, %arg111) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %75 = call @Unknown92(%alloc_71, %arg112) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %76 = call @Unknown92(%alloc_62, %arg114) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %77 = call @Unknown92(%alloc_63, %arg115) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %78 = call @Unknown92(%alloc_74, %arg117) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %79 = call @Unknown92(%alloc_75, %arg118) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %80 = call @Unknown92(%alloc_78, %arg120) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %81 = call @Unknown92(%alloc_79, %arg121) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + return %41, %arg0, %arg1, %arg5, %arg6, %arg7, %arg8, %arg11, %arg12, %arg13, %arg14, %arg17, %arg18, %arg19, %arg20, %arg24, %arg25, %arg26, %arg27, %arg28, %arg29, %arg32, %arg33, %arg34, %arg35, %arg39, %arg40, %arg41, %arg42, %arg43, %arg44, %arg47, %arg48, %arg49, %arg50, %arg54, %arg55, %arg56, %arg57, %arg58, %arg59, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, %1, %0, %alloc, %2, %alloc_3, %3, %alloc_4, %4, %5, %alloc_8, %6, %7, %alloc_12, %8, %9, %alloc_16, %10, %12, %alloc_24, %13, %14, %alloc_28, %11, %alloc_20, %15, %16, %alloc_32, %17, %18, %alloc_36, %19, %21, %alloc_44, %22, %23, %alloc_48, %20, %alloc_40, %24, %25, %alloc_52, %26, %27, %alloc_56, %28, %30, %alloc_64, %31, %32, %alloc_68, %29, %alloc_60, %33, %34, %alloc_72, %35, %36, %alloc_76, %37, %39, %alloc_80 : memref<1x1000xf16>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, 
memref<128xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<64x3x7x7xf16>, memref<1x3x224x224xf16>, memref<1x64x112x112xf16>, memref<1x64x112x112xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<128x64x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<128x64x1x1xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<256x128x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<256x128x1x1xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<512x256x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<512x256x1x1xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<1x512xf16>, memref<512x1000xf16> } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/FW/7_set_space_opt.mlir b/compiler/test/E2E/ResNet18/FW/7_set_space_opt.mlir index 881ac0f3e..61f68bea1 100644 --- a/compiler/test/E2E/ResNet18/FW/7_set_space_opt.mlir +++ b/compiler/test/E2E/ResNet18/FW/7_set_space_opt.mlir @@ -1,3133 +1,998 @@ -// RUN: byteir-opt %s -remove-func-body="anchor-attr=__byteir_elementwise_fusion__" -set-op-space="entry-func=main space=cuda" -set-arg-space="entry-func=main all-space=cuda" | FileCheck %s +// RUN: byteir-opt %s -remove-func-body="anchor-attr=__byteir_elementwise_fusion__" -inline -gpu-launch-func-to-byre -set-op-space="entry-func=main space=cuda" -set-arg-space="entry-func=main all-space=cuda" | FileCheck %s // CHECK-LABEL: func.func @main module attributes {gpu.container_module} { gpu.module @unified { - gpu.func @Unknown100(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> - } - gpu.return - } - gpu.func @Unknown99(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 + gpu.func 
@Unknown92(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { + %cst = arith.constant 1.000000e-01 : f32 + %cst_0 = arith.constant 0.899999976 : f32 %c512 = arith.constant 512 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c512 step %6 { + %7 = memref.load %arg1[%arg3] : memref<512xf32> + %8 = memref.load %arg0[%arg3] : memref<512xf32> + %9 = arith.mulf %7, %cst_0 : f32 + %10 = arith.mulf %8, %cst : f32 + %11 = arith.addf %10, %9 : f32 + memref.store %11, %arg2[%arg3] : memref<512xf32> } gpu.return } - gpu.func @Unknown98(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c512 = arith.constant 512 : index + gpu.func @Unknown82(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { + %cst = arith.constant 1.000000e-01 : f32 + %cst_0 = arith.constant 0.899999976 : f32 + %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c256 step %6 { + %7 = memref.load %arg1[%arg3] : memref<256xf32> + %8 = memref.load %arg0[%arg3] : memref<256xf32> + %9 = arith.mulf %7, %cst_0 : f32 + %10 = arith.mulf %8, %cst : f32 + %11 = arith.addf %10, %9 : f32 + memref.store %11, %arg2[%arg3] : memref<256xf32> } gpu.return } - gpu.func @Unknown97(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c512 = arith.constant 512 : index + gpu.func @Unknown72(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { + %cst = arith.constant 1.000000e-01 : f32 + %cst_0 = arith.constant 0.899999976 : f32 + %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c128 step %6 { + %7 = memref.load %arg1[%arg3] : memref<128xf32> + %8 = memref.load %arg0[%arg3] : memref<128xf32> + %9 = arith.mulf %7, %cst_0 : f32 + %10 = arith.mulf %8, %cst : f32 + %11 = arith.addf %10, %9 : f32 + memref.store %11, %arg2[%arg3] : memref<128xf32> } gpu.return } - gpu.func @Unknown96(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: 
memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c512 = arith.constant 512 : index + gpu.func @Unknown62(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { + %cst = arith.constant 1.000000e-01 : f32 + %cst_0 = arith.constant 0.899999976 : f32 + %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c64 step %6 { + %7 = memref.load %arg1[%arg3] : memref<64xf32> + %8 = memref.load %arg0[%arg3] : memref<64xf32> + %9 = arith.mulf %7, %cst_0 : f32 + %10 = arith.mulf %8, %cst : f32 + %11 = arith.addf %10, %9 : f32 + memref.store %11, %arg2[%arg3] : memref<64xf32> } gpu.return } - gpu.func @Unknown95(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c512 = arith.constant 512 : index + gpu.func @Unknown61(%arg0: memref<1000xf32>, %arg1: memref<1x1000xf16>, %arg2: memref<1x1000xf16>) kernel { + %c0 = arith.constant 0 : index + %c1000 = arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c1000 step %6 { + %7 = memref.load %arg0[%arg3] : memref<1000xf32> + %8 = memref.load %arg1[%c0, %arg3] : memref<1x1000xf16> + %9 = arith.truncf %7 : f32 to f16 + %10 = arith.addf %8, %9 : f16 + memref.store %10, %arg2[%c0, %arg3] : memref<1x1000xf16> } gpu.return } - gpu.func @Unknown94(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 + gpu.func @Unknown60(%arg0: memref<1000x512xf32>, %arg1: memref<1000x512xf16>) kernel { + %c512000 = arith.constant 512000 : index %c512 = arith.constant 512 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c512000 step %6 { + %7 = arith.remsi %arg2, %c512 : index + %8 = arith.divsi %arg2, %c512 : index + %9 = memref.load %arg0[%8, %7] : memref<1000x512xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7] : memref<1000x512xf16> } gpu.return } - gpu.func @Unknown93(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - 
%cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 + gpu.func @Unknown59(%arg0: memref<1x512xf16>, %arg1: memref<1x512xf16>) kernel { + %cst = arith.constant 2.040100e-02 : f16 + %c0 = arith.constant 0 : index %c512 = arith.constant 512 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c512 step %6 { + %7 = memref.load %arg0[%c0, %arg2] : memref<1x512xf16> + %8 = arith.mulf %7, %cst : f16 + memref.store %8, %arg1[%c0, %arg2] : memref<1x512xf16> } gpu.return } - gpu.func @Unknown92(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c512 = arith.constant 512 : index + gpu.func @Unknown51(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) kernel { + %c25088 = arith.constant 25088 : index + %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c25088 step %6 { + %7 = arith.remsi %arg3, %c7 : index + %8 = arith.divsi %arg3, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %13 = arith.addf %11, %12 : f16 + %14 = arith.maximumf %13, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x512x7x7xf16> } gpu.return } - gpu.func @Unknown91(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 + gpu.func @Unknown49(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { + %c2359296 = arith.constant 2359296 : index %c512 = arith.constant 512 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> - } - gpu.return - } - gpu.func @Unknown90(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = 
arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown89(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown88(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown87(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown86(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c2359296 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x512x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : 
memref<512x512x3x3xf16> } gpu.return } - gpu.func @Unknown85(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index + gpu.func @Unknown48(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) kernel { + %c25088 = arith.constant 25088 : index + %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c25088 step %6 { + %7 = arith.remsi %arg2, %c7 : index + %8 = arith.divsi %arg2, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %12 = arith.maximumf %11, %cst : f16 + memref.store %12, %arg1[%c0, %10, %9, %7] : memref<1x512x7x7xf16> } gpu.return } - gpu.func @Unknown84(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 + gpu.func @Unknown46(%arg0: memref<512x256x3x3xf32>, %arg1: memref<512x256x3x3xf16>) kernel { + %c1179648 = arith.constant 1179648 : index %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1179648 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x256x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x256x3x3xf16> } gpu.return } - gpu.func @Unknown83(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 + gpu.func @Unknown44(%arg0: memref<512x256x1x1xf32>, %arg1: memref<512x256x1x1xf16>) kernel { + %c131072 = arith.constant 131072 : index + %c0 = arith.constant 0 : index %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> + %5 = gpu.grid_dim x + %6 = arith.muli 
%1, %5 : index + scf.for %arg2 = %4 to %c131072 step %6 { + %7 = arith.remsi %arg2, %c256 : index + %8 = arith.divsi %arg2, %c256 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<512x256x1x1xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<512x256x1x1xf16> } gpu.return } - gpu.func @Unknown82(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index + gpu.func @Unknown37(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>) kernel { + %c50176 = arith.constant 50176 : index + %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c14 = arith.constant 14 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c50176 step %6 { + %7 = arith.remsi %arg3, %c14 : index + %8 = arith.divsi %arg3, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %13 = arith.addf %11, %12 : f16 + %14 = arith.maximumf %13, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x256x14x14xf16> } gpu.return } - gpu.func @Unknown81(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 + gpu.func @Unknown35(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { + %c589824 = arith.constant 589824 : index %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown80(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown79(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = 
gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown78(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown77(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown76(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c589824 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x256x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x256x3x3xf16> } gpu.return } - gpu.func @Unknown75(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index + gpu.func @Unknown34(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) kernel { + %c50176 = arith.constant 50176 : index + %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c14 = arith.constant 14 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = 
arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c50176 step %6 { + %7 = arith.remsi %arg2, %c14 : index + %8 = arith.divsi %arg2, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %12 = arith.maximumf %11, %cst : f16 + memref.store %12, %arg1[%c0, %10, %9, %7] : memref<1x256x14x14xf16> } gpu.return } - gpu.func @Unknown74(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 + gpu.func @Unknown32(%arg0: memref<256x128x3x3xf32>, %arg1: memref<256x128x3x3xf16>) kernel { + %c294912 = arith.constant 294912 : index %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c294912 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x128x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x128x3x3xf16> } gpu.return } - gpu.func @Unknown73(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 + gpu.func @Unknown30(%arg0: memref<256x128x1x1xf32>, %arg1: memref<256x128x1x1xf16>) kernel { + %c32768 = arith.constant 32768 : index + %c0 = arith.constant 0 : index %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c32768 step %6 { + %7 = arith.remsi %arg2, %c128 : index + %8 = arith.divsi %arg2, %c128 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<256x128x1x1xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<256x128x1x1xf16> } gpu.return } - gpu.func @Unknown72(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index + gpu.func @Unknown23(%arg0: 
memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) kernel { + %c100352 = arith.constant 100352 : index + %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c28 = arith.constant 28 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg3, %c28 : index + %8 = arith.divsi %arg3, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %13 = arith.addf %11, %12 : f16 + %14 = arith.maximumf %13, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x128x28x28xf16> } gpu.return } - gpu.func @Unknown71(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 + gpu.func @Unknown21(%arg0: memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { + %c147456 = arith.constant 147456 : index %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown70(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown69(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c147456 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + 
%9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x128x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x128x3x3xf16> } gpu.return } - gpu.func @Unknown68(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index + gpu.func @Unknown20(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) kernel { + %c100352 = arith.constant 100352 : index + %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c28 = arith.constant 28 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg2, %c28 : index + %8 = arith.divsi %arg2, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %12 = arith.maximumf %11, %cst : f16 + memref.store %12, %arg1[%c0, %10, %9, %7] : memref<1x128x28x28xf16> } gpu.return } - gpu.func @Unknown67(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 + gpu.func @Unknown18(%arg0: memref<128x64x3x3xf32>, %arg1: memref<128x64x3x3xf16>) kernel { + %c73728 = arith.constant 73728 : index %c64 = arith.constant 64 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c73728 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x64x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x64x3x3xf16> } gpu.return } - gpu.func @Unknown66(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 + gpu.func @Unknown16(%arg0: memref<128x64x1x1xf32>, %arg1: memref<128x64x1x1xf16>) kernel { + %c8192 = arith.constant 8192 : index + %c0 = arith.constant 0 : index %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 
= memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c8192 step %6 { + %7 = arith.remsi %arg2, %c64 : index + %8 = arith.divsi %arg2, %c64 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<128x64x1x1xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<128x64x1x1xf16> } gpu.return } - gpu.func @Unknown65(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index + gpu.func @Unknown9(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) kernel { + %c200704 = arith.constant 200704 : index + %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c56 = arith.constant 56 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg3, %c56 : index + %8 = arith.divsi %arg3, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %13 = arith.addf %11, %12 : f16 + %14 = arith.maximumf %13, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x64x56x56xf16> } gpu.return } - gpu.func @Unknown64(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index + gpu.func @Unknown6(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) kernel { + %c200704 = arith.constant 200704 : index + %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c56 = arith.constant 56 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg2, %c56 : index + %8 = arith.divsi %arg2, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %12 = arith.maximumf %11, %cst : f16 + memref.store %12, %arg1[%c0, %10, %9, %7] : memref<1x64x56x56xf16> } gpu.return } - gpu.func @Unknown63(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 
1.000000e-01 : f32 + gpu.func @Unknown4(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { + %c36864 = arith.constant 36864 : index %c64 = arith.constant 64 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c36864 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x64x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x64x3x3xf16> } gpu.return } - gpu.func @Unknown62(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index + gpu.func @Unknown3(%arg0: memref<1x64x112x112xf16>, %arg1: memref<1x64x112x112xf16>) kernel { + %c802816 = arith.constant 802816 : index + %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c112 = arith.constant 112 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg2, %c112 : index + %8 = arith.divsi %arg2, %c112 : index + %9 = arith.remsi %8, %c112 : index + %10 = arith.divsi %8, %c112 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x112x112xf16> + %12 = arith.maximumf %11, %cst : f16 + memref.store %12, %arg1[%c0, %10, %9, %7] : memref<1x64x112x112xf16> } gpu.return } - gpu.func @Unknown61(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index + gpu.func @Unknown1(%arg0: memref<64x3x7x7xf32>, %arg1: memref<64x3x7x7xf16>) kernel { + %c9408 = arith.constant 9408 : index + %c3 = arith.constant 3 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c9408 step %6 { + %7 = arith.remsi %arg2, %c7 : index + %8 = arith.divsi %arg2, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : 
index + %11 = arith.remsi %10, %c3 : index + %12 = arith.divsi %10, %c3 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x3x7x7xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x3x7x7xf16> } gpu.return } - gpu.func @Unknown60(%arg0: memref<1000xf32>, %arg1: memref<1x1000xf16>, %arg2: memref<1x1000xf16>) kernel { + gpu.func @Unknown0(%arg0: memref<1x3x224x224xf32>, %arg1: memref<1x3x224x224xf16>) kernel { + %c150528 = arith.constant 150528 : index %c0 = arith.constant 0 : index - %c1000 = arith.constant 1000 : index + %c224 = arith.constant 224 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1000 : index - scf.if %5 { - %6 = memref.load %arg1[%c0, %4] : memref<1x1000xf16> - %7 = memref.load %arg0[%4] : memref<1000xf32> - %8 = arith.truncf %7 : f32 to f16 - %9 = arith.addf %6, %8 : f16 - memref.store %9, %arg2[%c0, %4] : memref<1x1000xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c150528 step %6 { + %7 = arith.remsi %arg2, %c224 : index + %8 = arith.divsi %arg2, %c224 : index + %9 = arith.remsi %8, %c224 : index + %10 = arith.divsi %8, %c224 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x3x224x224xf32> + %12 = arith.truncf %11 : f32 to f16 + memref.store %12, %arg1[%c0, %10, %9, %7] : memref<1x3x224x224xf16> } gpu.return } - gpu.func @Unknown59(%arg0: memref<1000x512xf32>, %arg1: memref<1000x512xf16>) kernel { - %c0 = arith.constant 0 : index - %c512000 = arith.constant 512000 : index - %c512 = arith.constant 512 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown58_kernel(%arg0: memref<512x49xf16>, %arg1: memref<512xf16>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512000 : index - scf.if %5 { - %6 = arith.remsi %4, %c512 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c512 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c512 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9] : memref<1000x512xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9] : memref<1000x512xf16> - } - gpu.return - } - gpu.func @Unknown58(%arg0: memref<1x512xf16>, %arg1: memref<1x512xf16>) kernel { - %cst = arith.constant 2.040100e-02 : f16 + %1 = gpu.block_id y + %2 = gpu.block_id z + %3 = gpu.thread_id x + %4 = gpu.thread_id y + %5 = gpu.thread_id z + %6 = gpu.grid_dim x + %7 = gpu.grid_dim y + %8 = gpu.grid_dim z + %9 = gpu.block_dim x + %10 = gpu.block_dim y + %11 = gpu.block_dim z + cf.br ^bb1 + ^bb1: // pred: ^bb0 + %c64 = arith.constant 64 : index %c0 = arith.constant 0 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%c0, %4] : memref<1x512xf16> - %7 = arith.mulf %6, %cst : f16 - memref.store %7, %arg1[%c0, %4] : memref<1x512xf16> - } - gpu.return - } - gpu.func @Unknown57(%arg0: memref<1x512x7x7xf16>, %arg1: 
memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) kernel { + %c49 = arith.constant 49 : index + %c1 = arith.constant 1 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - } - gpu.return - } - gpu.func @Unknown55(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf16> - } - gpu.return - } - gpu.func @Unknown54(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) kernel { - %cst = arith.constant 
0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - } - gpu.return - } - gpu.func @Unknown52(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf16> - } - gpu.return - } - gpu.func @Unknown51(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 
= arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - } - gpu.return - } - gpu.func @Unknown49(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf16> - } - gpu.return - } - gpu.func @Unknown48(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = 
arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - } - gpu.return - } - gpu.func @Unknown46(%arg0: memref<512x256x3x3xf32>, %arg1: memref<512x256x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c1179648 = arith.constant 1179648 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1179648 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x256x3x3xf16> - } - gpu.return - } - gpu.func @Unknown44(%arg0: memref<512x256x1x1xf32>, %arg1: memref<512x256x1x1xf16>) kernel { - %c0 = arith.constant 0 : index - %c131072 = arith.constant 131072 : index - %c256 = arith.constant 256 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c131072 : index - scf.if %5 { - %6 = arith.remsi %4, %c256 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c256 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c256 : index - %14 = arith.subi %c-1, %13 
: index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<512x256x1x1xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<512x256x1x1xf16> - } - gpu.return - } - gpu.func @Unknown43(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - } - gpu.return - } - gpu.func @Unknown41(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : 
memref<256x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf16> - } - gpu.return - } - gpu.func @Unknown40(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - } - gpu.return - } - gpu.func @Unknown38(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf16> - } - gpu.return - } - gpu.func @Unknown37(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: 
memref<1x256x14x14xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - } - gpu.return - } - gpu.func @Unknown35(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf16> - } - gpu.return - } - gpu.func @Unknown34(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : 
index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - } - gpu.return - } - gpu.func @Unknown32(%arg0: memref<256x128x3x3xf32>, %arg1: memref<256x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c294912 = arith.constant 294912 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c294912 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x128x3x3xf16> - } - gpu.return - } - gpu.func @Unknown30(%arg0: memref<256x128x1x1xf32>, %arg1: memref<256x128x1x1xf16>) kernel { - %c0 = arith.constant 0 : index - %c32768 = arith.constant 32768 : index - %c128 = arith.constant 128 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c32768 : index - scf.if %5 { - %6 = arith.remsi %4, 
%c128 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c128 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c128 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<256x128x1x1xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<256x128x1x1xf16> - } - gpu.return - } - gpu.func @Unknown29(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - } - gpu.return - } - gpu.func @Unknown27(%arg0: memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, 
%28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf16> - } - gpu.return - } - gpu.func @Unknown26(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - } - gpu.return - } - gpu.func @Unknown24(%arg0: memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, 
%33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf16> - } - gpu.return - } - gpu.func @Unknown23(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - } - gpu.return - } - gpu.func @Unknown21(%arg0: memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf32> - %37 = arith.truncf %36 : 
f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf16> - } - gpu.return - } - gpu.func @Unknown20(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - } - gpu.return - } - gpu.func @Unknown18(%arg0: memref<128x64x3x3xf32>, %arg1: memref<128x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c73728 = arith.constant 73728 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c73728 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x64x3x3xf16> - } - gpu.return - } - gpu.func @Unknown16(%arg0: memref<128x64x1x1xf32>, %arg1: memref<128x64x1x1xf16>) kernel { - %c0 = arith.constant 0 : index - %c8192 = arith.constant 8192 
: index - %c64 = arith.constant 64 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c8192 : index - scf.if %5 { - %6 = arith.remsi %4, %c64 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c64 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c64 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<128x64x1x1xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<128x64x1x1xf16> - } - gpu.return - } - gpu.func @Unknown15(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - } - gpu.return - } - gpu.func @Unknown13(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 
: index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> - } - gpu.return - } - gpu.func @Unknown12(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - } - gpu.return - } - gpu.func @Unknown10(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, 
%28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> - } - gpu.return - } - gpu.func @Unknown9(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - } - gpu.return - } - gpu.func @Unknown7(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : 
index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> - } - gpu.return - } - gpu.func @Unknown6(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - } - gpu.return - } - gpu.func @Unknown4(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, 
%19, %9] : memref<64x64x3x3xf16> - } - gpu.return - } - gpu.func @Unknown3(%arg0: memref<1x64x112x112xf16>, %arg1: memref<1x64x112x112xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c112 = arith.constant 112 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c112 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c112 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c112 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c112 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c112 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c112 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x112x112xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x64x112x112xf16> - } - gpu.return - } - gpu.func @Unknown1(%arg0: memref<64x3x7x7xf32>, %arg1: memref<64x3x7x7xf16>) kernel { - %c0 = arith.constant 0 : index - %c9408 = arith.constant 9408 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c3 = arith.constant 3 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c9408 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c3 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c3 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c3 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x3x7x7xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x3x7x7xf16> - } - gpu.return - } - gpu.func @Unknown0(%arg0: memref<1x3x224x224xf32>, %arg1: memref<1x3x224x224xf16>) kernel { - %c0 = arith.constant 0 : index - %c150528 = arith.constant 150528 : index - %c224 = arith.constant 224 : index - 
%c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c150528 : index - scf.if %5 { - %6 = arith.remsi %4, %c224 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c224 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c224 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c224 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c224 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c224 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x3x224x224xf32> - %27 = arith.truncf %26 : f32 to f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x3x224x224xf16> - } - gpu.return - } - } - func.func private @Unknown0(%arg0: memref<1x3x224x224xf32>) -> memref<1x3x224x224xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1176 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown0", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c1176 = arith.constant 1176 : index - %alloc = memref.alloc() : memref<1x3x224x224xf16> - gpu.launch_func @unified::@Unknown0 blocks in (%c1176, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x3x224x224xf32>, %alloc : memref<1x3x224x224xf16>) - return %alloc : memref<1x3x224x224xf16> - } - func.func private @Unknown1(%arg0: memref<64x3x7x7xf32>) -> memref<64x3x7x7xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 74 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown1", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c74 = arith.constant 74 : index - %alloc = memref.alloc() : memref<64x3x7x7xf16> - gpu.launch_func @unified::@Unknown1 blocks in (%c74, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64x3x7x7xf32>, %alloc : memref<64x3x7x7xf16>) - return %alloc : memref<64x3x7x7xf16> - } - func.func private @Unknown3(%arg0: memref<1x64x112x112xf16>) -> memref<1x64x112x112xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 6272 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown3", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c6272 = arith.constant 6272 : index - %alloc = memref.alloc() : memref<1x64x112x112xf16> - gpu.launch_func @unified::@Unknown3 blocks in (%c6272, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x64x112x112xf16>, %alloc : memref<1x64x112x112xf16>) - return %alloc : memref<1x64x112x112xf16> - } - func.func private @Unknown4(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, 
__byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown4", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c288 = arith.constant 288 : index - %alloc = memref.alloc() : memref<64x64x3x3xf16> - gpu.launch_func @unified::@Unknown4 blocks in (%c288, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64x64x3x3xf32>, %alloc : memref<64x64x3x3xf16>) - return %alloc : memref<64x64x3x3xf16> - } - func.func private @Unknown6(%arg0: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown6", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c1568 = arith.constant 1568 : index - %alloc = memref.alloc() : memref<1x64x56x56xf16> - gpu.launch_func @unified::@Unknown6 blocks in (%c1568, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x64x56x56xf16>, %alloc : memref<1x64x56x56xf16>) - return %alloc : memref<1x64x56x56xf16> - } - func.func private @Unknown7(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown7", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c288 = arith.constant 288 : index - %alloc = memref.alloc() : memref<64x64x3x3xf16> - gpu.launch_func @unified::@Unknown7 blocks in (%c288, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64x64x3x3xf32>, %alloc : memref<64x64x3x3xf16>) - return %alloc : memref<64x64x3x3xf16> - } - func.func private @Unknown9(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown9", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c1568 = arith.constant 1568 : index - %alloc = memref.alloc() : memref<1x64x56x56xf16> - gpu.launch_func @unified::@Unknown9 blocks in (%c1568, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x64x56x56xf16>, %arg1 : memref<1x64x56x56xf16>, %alloc : memref<1x64x56x56xf16>) - return %alloc : memref<1x64x56x56xf16> - } - func.func private @Unknown10(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown10", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c288 = arith.constant 288 : index - %alloc = memref.alloc() : memref<64x64x3x3xf16> - gpu.launch_func @unified::@Unknown10 blocks in (%c288, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64x64x3x3xf32>, %alloc : memref<64x64x3x3xf16>) - return %alloc : 
memref<64x64x3x3xf16> - } - func.func private @Unknown12(%arg0: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown12", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c1568 = arith.constant 1568 : index - %alloc = memref.alloc() : memref<1x64x56x56xf16> - gpu.launch_func @unified::@Unknown12 blocks in (%c1568, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x64x56x56xf16>, %alloc : memref<1x64x56x56xf16>) - return %alloc : memref<1x64x56x56xf16> - } - func.func private @Unknown13(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown13", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c288 = arith.constant 288 : index - %alloc = memref.alloc() : memref<64x64x3x3xf16> - gpu.launch_func @unified::@Unknown13 blocks in (%c288, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64x64x3x3xf32>, %alloc : memref<64x64x3x3xf16>) - return %alloc : memref<64x64x3x3xf16> - } - func.func private @Unknown15(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown15", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c1568 = arith.constant 1568 : index - %alloc = memref.alloc() : memref<1x64x56x56xf16> - gpu.launch_func @unified::@Unknown15 blocks in (%c1568, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x64x56x56xf16>, %arg1 : memref<1x64x56x56xf16>, %alloc : memref<1x64x56x56xf16>) - return %alloc : memref<1x64x56x56xf16> - } - func.func private @Unknown16(%arg0: memref<128x64x1x1xf32>) -> memref<128x64x1x1xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 64 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown16", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %alloc = memref.alloc() : memref<128x64x1x1xf16> - gpu.launch_func @unified::@Unknown16 blocks in (%c64, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128x64x1x1xf32>, %alloc : memref<128x64x1x1xf16>) - return %alloc : memref<128x64x1x1xf16> - } - func.func private @Unknown18(%arg0: memref<128x64x3x3xf32>) -> memref<128x64x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 576 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown18", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c576 = arith.constant 576 : index - %alloc = memref.alloc() : memref<128x64x3x3xf16> - 
gpu.launch_func @unified::@Unknown18 blocks in (%c576, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128x64x3x3xf32>, %alloc : memref<128x64x3x3xf16>) - return %alloc : memref<128x64x3x3xf16> - } - func.func private @Unknown20(%arg0: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown20", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c784 = arith.constant 784 : index - %alloc = memref.alloc() : memref<1x128x28x28xf16> - gpu.launch_func @unified::@Unknown20 blocks in (%c784, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x128x28x28xf16>, %alloc : memref<1x128x28x28xf16>) - return %alloc : memref<1x128x28x28xf16> - } - func.func private @Unknown21(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown21", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c1152 = arith.constant 1152 : index - %alloc = memref.alloc() : memref<128x128x3x3xf16> - gpu.launch_func @unified::@Unknown21 blocks in (%c1152, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128x128x3x3xf32>, %alloc : memref<128x128x3x3xf16>) - return %alloc : memref<128x128x3x3xf16> - } - func.func private @Unknown23(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown23", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c784 = arith.constant 784 : index - %alloc = memref.alloc() : memref<1x128x28x28xf16> - gpu.launch_func @unified::@Unknown23 blocks in (%c784, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x128x28x28xf16>, %arg1 : memref<1x128x28x28xf16>, %alloc : memref<1x128x28x28xf16>) - return %alloc : memref<1x128x28x28xf16> - } - func.func private @Unknown24(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown24", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c1152 = arith.constant 1152 : index - %alloc = memref.alloc() : memref<128x128x3x3xf16> - gpu.launch_func @unified::@Unknown24 blocks in (%c1152, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128x128x3x3xf32>, %alloc : memref<128x128x3x3xf16>) - return %alloc : memref<128x128x3x3xf16> - } - func.func private @Unknown26(%arg0: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown26", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], 
byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c784 = arith.constant 784 : index - %alloc = memref.alloc() : memref<1x128x28x28xf16> - gpu.launch_func @unified::@Unknown26 blocks in (%c784, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x128x28x28xf16>, %alloc : memref<1x128x28x28xf16>) - return %alloc : memref<1x128x28x28xf16> - } - func.func private @Unknown27(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown27", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c1152 = arith.constant 1152 : index - %alloc = memref.alloc() : memref<128x128x3x3xf16> - gpu.launch_func @unified::@Unknown27 blocks in (%c1152, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128x128x3x3xf32>, %alloc : memref<128x128x3x3xf16>) - return %alloc : memref<128x128x3x3xf16> - } - func.func private @Unknown29(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown29", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c784 = arith.constant 784 : index - %alloc = memref.alloc() : memref<1x128x28x28xf16> - gpu.launch_func @unified::@Unknown29 blocks in (%c784, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x128x28x28xf16>, %arg1 : memref<1x128x28x28xf16>, %alloc : memref<1x128x28x28xf16>) - return %alloc : memref<1x128x28x28xf16> - } - func.func private @Unknown30(%arg0: memref<256x128x1x1xf32>) -> memref<256x128x1x1xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 256 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown30", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c256 = arith.constant 256 : index - %alloc = memref.alloc() : memref<256x128x1x1xf16> - gpu.launch_func @unified::@Unknown30 blocks in (%c256, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256x128x1x1xf32>, %alloc : memref<256x128x1x1xf16>) - return %alloc : memref<256x128x1x1xf16> - } - func.func private @Unknown32(%arg0: memref<256x128x3x3xf32>) -> memref<256x128x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2304 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown32", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c2304 = arith.constant 2304 : index - %alloc = memref.alloc() : memref<256x128x3x3xf16> - gpu.launch_func @unified::@Unknown32 blocks in (%c2304, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256x128x3x3xf32>, %alloc : memref<256x128x3x3xf16>) - return %alloc : memref<256x128x3x3xf16> - } - func.func private @Unknown34(%arg0: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> 
attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 392 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown34", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c392 = arith.constant 392 : index - %alloc = memref.alloc() : memref<1x256x14x14xf16> - gpu.launch_func @unified::@Unknown34 blocks in (%c392, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x256x14x14xf16>, %alloc : memref<1x256x14x14xf16>) - return %alloc : memref<1x256x14x14xf16> - } - func.func private @Unknown35(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4608 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown35", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c4608 = arith.constant 4608 : index - %alloc = memref.alloc() : memref<256x256x3x3xf16> - gpu.launch_func @unified::@Unknown35 blocks in (%c4608, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256x256x3x3xf32>, %alloc : memref<256x256x3x3xf16>) - return %alloc : memref<256x256x3x3xf16> - } - func.func private @Unknown37(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 392 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown37", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c392 = arith.constant 392 : index - %alloc = memref.alloc() : memref<1x256x14x14xf16> - gpu.launch_func @unified::@Unknown37 blocks in (%c392, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x256x14x14xf16>, %arg1 : memref<1x256x14x14xf16>, %alloc : memref<1x256x14x14xf16>) - return %alloc : memref<1x256x14x14xf16> - } - func.func private @Unknown38(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4608 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown38", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c4608 = arith.constant 4608 : index - %alloc = memref.alloc() : memref<256x256x3x3xf16> - gpu.launch_func @unified::@Unknown38 blocks in (%c4608, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256x256x3x3xf32>, %alloc : memref<256x256x3x3xf16>) - return %alloc : memref<256x256x3x3xf16> - } - func.func private @Unknown40(%arg0: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 392 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown40", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c392 = arith.constant 392 : index - %alloc = memref.alloc() : memref<1x256x14x14xf16> - gpu.launch_func @unified::@Unknown40 blocks in (%c392, %c1, %c1) threads in (%c128, 
%c1, %c1) args(%arg0 : memref<1x256x14x14xf16>, %alloc : memref<1x256x14x14xf16>) - return %alloc : memref<1x256x14x14xf16> - } - func.func private @Unknown41(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4608 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown41", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c4608 = arith.constant 4608 : index - %alloc = memref.alloc() : memref<256x256x3x3xf16> - gpu.launch_func @unified::@Unknown41 blocks in (%c4608, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256x256x3x3xf32>, %alloc : memref<256x256x3x3xf16>) - return %alloc : memref<256x256x3x3xf16> - } - func.func private @Unknown43(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 392 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown43", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c392 = arith.constant 392 : index - %alloc = memref.alloc() : memref<1x256x14x14xf16> - gpu.launch_func @unified::@Unknown43 blocks in (%c392, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x256x14x14xf16>, %arg1 : memref<1x256x14x14xf16>, %alloc : memref<1x256x14x14xf16>) - return %alloc : memref<1x256x14x14xf16> - } - func.func private @Unknown44(%arg0: memref<512x256x1x1xf32>) -> memref<512x256x1x1xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1024 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown44", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c1024 = arith.constant 1024 : index - %alloc = memref.alloc() : memref<512x256x1x1xf16> - gpu.launch_func @unified::@Unknown44 blocks in (%c1024, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512x256x1x1xf32>, %alloc : memref<512x256x1x1xf16>) - return %alloc : memref<512x256x1x1xf16> - } - func.func private @Unknown46(%arg0: memref<512x256x3x3xf32>) -> memref<512x256x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 9216 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown46", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c9216 = arith.constant 9216 : index - %alloc = memref.alloc() : memref<512x256x3x3xf16> - gpu.launch_func @unified::@Unknown46 blocks in (%c9216, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512x256x3x3xf32>, %alloc : memref<512x256x3x3xf16>) - return %alloc : memref<512x256x3x3xf16> - } - func.func private @Unknown48(%arg0: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown48", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 
: index - %c1 = arith.constant 1 : index - %c196 = arith.constant 196 : index - %alloc = memref.alloc() : memref<1x512x7x7xf16> - gpu.launch_func @unified::@Unknown48 blocks in (%c196, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x512x7x7xf16>, %alloc : memref<1x512x7x7xf16>) - return %alloc : memref<1x512x7x7xf16> - } - func.func private @Unknown49(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 18432 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown49", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c18432 = arith.constant 18432 : index - %alloc = memref.alloc() : memref<512x512x3x3xf16> - gpu.launch_func @unified::@Unknown49 blocks in (%c18432, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512x512x3x3xf32>, %alloc : memref<512x512x3x3xf16>) - return %alloc : memref<512x512x3x3xf16> - } - func.func private @Unknown51(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown51", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c196 = arith.constant 196 : index - %alloc = memref.alloc() : memref<1x512x7x7xf16> - gpu.launch_func @unified::@Unknown51 blocks in (%c196, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x512x7x7xf16>, %arg1 : memref<1x512x7x7xf16>, %alloc : memref<1x512x7x7xf16>) - return %alloc : memref<1x512x7x7xf16> - } - func.func private @Unknown52(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 18432 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown52", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c18432 = arith.constant 18432 : index - %alloc = memref.alloc() : memref<512x512x3x3xf16> - gpu.launch_func @unified::@Unknown52 blocks in (%c18432, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512x512x3x3xf32>, %alloc : memref<512x512x3x3xf16>) - return %alloc : memref<512x512x3x3xf16> - } - func.func private @Unknown54(%arg0: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown54", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c196 = arith.constant 196 : index - %alloc = memref.alloc() : memref<1x512x7x7xf16> - gpu.launch_func @unified::@Unknown54 blocks in (%c196, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x512x7x7xf16>, %alloc : memref<1x512x7x7xf16>) - return %alloc : memref<1x512x7x7xf16> - } - func.func private @Unknown55(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 18432 : i32, __byre__arg_ranks = [4 : i32, 4 : 
i32], __byre__kernel_name = "Unknown55", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c18432 = arith.constant 18432 : index - %alloc = memref.alloc() : memref<512x512x3x3xf16> - gpu.launch_func @unified::@Unknown55 blocks in (%c18432, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512x512x3x3xf32>, %alloc : memref<512x512x3x3xf16>) - return %alloc : memref<512x512x3x3xf16> - } - func.func private @Unknown57(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown57", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c196 = arith.constant 196 : index - %alloc = memref.alloc() : memref<1x512x7x7xf16> - gpu.launch_func @unified::@Unknown57 blocks in (%c196, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x512x7x7xf16>, %arg1 : memref<1x512x7x7xf16>, %alloc : memref<1x512x7x7xf16>) - return %alloc : memref<1x512x7x7xf16> - } - func.func private @Unknown58(%arg0: memref<1x512xf16>) -> memref<1x512xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown58", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c4 = arith.constant 4 : index - %alloc = memref.alloc() : memref<1x512xf16> - gpu.launch_func @unified::@Unknown58 blocks in (%c4, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1x512xf16>, %alloc : memref<1x512xf16>) - return %alloc : memref<1x512xf16> - } - func.func private @Unknown59(%arg0: memref<1000x512xf32>) -> memref<1000x512xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4000 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown59", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c4000 = arith.constant 4000 : index - %alloc = memref.alloc() : memref<1000x512xf16> - gpu.launch_func @unified::@Unknown59 blocks in (%c4000, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1000x512xf32>, %alloc : memref<1000x512xf16>) - return %alloc : memref<1000x512xf16> - } - func.func private @Unknown60(%arg0: memref<1000xf32>, %arg1: memref<1x1000xf16>) -> memref<1x1000xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 8 : i32, __byre__arg_ranks = [1 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown60", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c8 = arith.constant 8 : index - %alloc = memref.alloc() : memref<1x1000xf16> - gpu.launch_func @unified::@Unknown60 blocks in (%c8, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1000xf32>, %arg1 : memref<1x1000xf16>, %alloc : memref<1x1000xf16>) - return %alloc : memref<1x1000xf16> - } - func.func private 
@Unknown61(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown61", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<64xf32> - gpu.launch_func @unified::@Unknown61 blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64xf32>, %arg1 : memref<64xf32>, %alloc : memref<64xf32>) - return %alloc : memref<64xf32> - } - func.func private @Unknown62(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown62", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + %c32 = arith.constant 32 : index + %c2 = arith.constant 2 : index + %c16 = arith.constant 16 : index + %c8 = arith.constant 8 : index + %c4 = arith.constant 4 : index + %12 = gpu.block_id x + %subview = memref.subview %arg0[%12, 0] [1, 49] [1, 1] : memref<512x49xf16> to memref<49xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<49xf16, strided<[1], offset: ?>> into memref<1x49xf16, strided<[49, 1], offset: ?>> + %alloca = memref.alloca() : memref<64xf16, #gpu.address_space> + %13 = gpu.thread_id x + %14 = arith.remsi %13, %c64 : index + %15 = arith.cmpi slt, %14, %c0 : index + %16 = arith.addi %14, %c64 : index + %17 = arith.select %15, %16, %14 : index + %18 = arith.cmpi slt, %17, %c49 : index + %19 = arith.select %18, %17, %c49 : index + %20 = arith.addi %17, %c1 : index + %21 = arith.cmpi slt, %20, %c49 : index + %22 = arith.select %21, %20, %c49 : index + %23 = arith.subi %22, %19 : index + %subview_0 = memref.subview %expand_shape[0, %19] [1, %23] [1, 1] : memref<1x49xf16, strided<[49, 1], offset: ?>> to memref> + %expand_shape_1 = memref.expand_shape %subview_0 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %24 = arith.cmpi ugt, %23, %c0 : index + %25 = scf.if %24 -> (f16) { + %33 = memref.load %expand_shape_1[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %33 : f16 + } else { + scf.yield %cst : f16 + } + %26 = arith.addf %25, %cst : f16 + memref.store %26, %alloca[%13] : memref<64xf16, #gpu.address_space> + gpu.barrier + %alloca_2 = memref.alloca() : memref<32xf16, #gpu.address_space> + %27 = arith.cmpi ult, %13, %c32 : index + scf.if %27 { + %33 = arith.muli %13, %c2 : index + %34 = memref.load %alloca[%33] : memref<64xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %33, %c1 : index + %37 = memref.load %alloca[%36] : memref<64xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_2[%13] : memref<32xf16, #gpu.address_space> + } + gpu.barrier + %alloca_3 = memref.alloca() : memref<16xf16, #gpu.address_space> + %28 = arith.cmpi ult, %13, %c16 : index + scf.if %28 { + %33 = arith.muli %13, %c2 : index + %34 = memref.load %alloca_2[%33] : memref<32xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %33, %c1 : index + %37 = memref.load %alloca_2[%36] : memref<32xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + 
memref.store %38, %alloca_3[%13] : memref<16xf16, #gpu.address_space> + } + gpu.barrier + %alloca_4 = memref.alloca() : memref<8xf16, #gpu.address_space> + %29 = arith.cmpi ult, %13, %c8 : index + scf.if %29 { + %33 = arith.muli %13, %c2 : index + %34 = memref.load %alloca_3[%33] : memref<16xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %33, %c1 : index + %37 = memref.load %alloca_3[%36] : memref<16xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_4[%13] : memref<8xf16, #gpu.address_space> + } + gpu.barrier + %alloca_5 = memref.alloca() : memref<4xf16, #gpu.address_space> + %30 = arith.cmpi ult, %13, %c4 : index + scf.if %30 { + %33 = arith.muli %13, %c2 : index + %34 = memref.load %alloca_4[%33] : memref<8xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %33, %c1 : index + %37 = memref.load %alloca_4[%36] : memref<8xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_5[%13] : memref<4xf16, #gpu.address_space> + } + gpu.barrier + %alloca_6 = memref.alloca() : memref<2xf16, #gpu.address_space> + %31 = arith.cmpi ult, %13, %c2 : index + scf.if %31 { + %33 = arith.muli %13, %c2 : index + %34 = memref.load %alloca_5[%33] : memref<4xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %33, %c1 : index + %37 = memref.load %alloca_5[%36] : memref<4xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_6[%13] : memref<2xf16, #gpu.address_space> + } + gpu.barrier + %32 = arith.cmpi ult, %13, %c1 : index + scf.if %32 { + %33 = arith.muli %13, %c2 : index + %34 = memref.load %alloca_6[%33] : memref<2xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %33, %c1 : index + %37 = memref.load %alloca_6[%36] : memref<2xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %arg1[%12] : memref<512xf16> + } + gpu.barrier + gpu.return + } + } + func.func private @Unknown0(%arg0: memref<1x3x224x224xf32>) -> memref<1x3x224x224xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 147 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown0", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c147 = arith.constant 147 : index %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<64xf32> - gpu.launch_func @unified::@Unknown62 blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64xf32>, %arg1 : memref<64xf32>, %alloc : memref<64xf32>) - return %alloc : memref<64xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<1x3x224x224xf16> + gpu.launch_func @unified::@Unknown0 blocks in (%c147, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1x3x224x224xf32>, %alloc : memref<1x3x224x224xf16>) + return %alloc : memref<1x3x224x224xf16> } - func.func private @Unknown63(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown63", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown1(%arg0: memref<64x3x7x7xf32>) -> memref<64x3x7x7xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 10 : 
i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown1", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c10 = arith.constant 10 : index %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<64xf32> - gpu.launch_func @unified::@Unknown63 blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64xf32>, %arg1 : memref<64xf32>, %alloc : memref<64xf32>) - return %alloc : memref<64xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<64x3x7x7xf16> + gpu.launch_func @unified::@Unknown1 blocks in (%c10, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<64x3x7x7xf32>, %alloc : memref<64x3x7x7xf16>) + return %alloc : memref<64x3x7x7xf16> } - func.func private @Unknown64(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown64", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown3(%arg0: memref<1x64x112x112xf16>) -> memref<1x64x112x112xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown3", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c784 = arith.constant 784 : index %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<64xf32> - gpu.launch_func @unified::@Unknown64 blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64xf32>, %arg1 : memref<64xf32>, %alloc : memref<64xf32>) - return %alloc : memref<64xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<1x64x112x112xf16> + gpu.launch_func @unified::@Unknown3 blocks in (%c784, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1x64x112x112xf16>, %alloc : memref<1x64x112x112xf16>) + return %alloc : memref<1x64x112x112xf16> } - func.func private @Unknown65(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown65", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown4(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 36 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown4", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c36 = arith.constant 36 : index %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<64xf32> - gpu.launch_func @unified::@Unknown65 blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64xf32>, %arg1 : memref<64xf32>, %alloc : memref<64xf32>) - return %alloc : memref<64xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<64x64x3x3xf16> + gpu.launch_func @unified::@Unknown4 blocks in (%c36, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<64x64x3x3xf32>, %alloc : memref<64x64x3x3xf16>) + return 
%alloc : memref<64x64x3x3xf16> } - func.func private @Unknown66(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown66", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown6(%arg0: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown6", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c196 = arith.constant 196 : index %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<64xf32> - gpu.launch_func @unified::@Unknown66 blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64xf32>, %arg1 : memref<64xf32>, %alloc : memref<64xf32>) - return %alloc : memref<64xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<1x64x56x56xf16> + gpu.launch_func @unified::@Unknown6 blocks in (%c196, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1x64x56x56xf16>, %alloc : memref<1x64x56x56xf16>) + return %alloc : memref<1x64x56x56xf16> } - func.func private @Unknown67(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown67", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown9(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown9", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c196 = arith.constant 196 : index %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<64xf32> - gpu.launch_func @unified::@Unknown67 blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64xf32>, %arg1 : memref<64xf32>, %alloc : memref<64xf32>) - return %alloc : memref<64xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<1x64x56x56xf16> + gpu.launch_func @unified::@Unknown9 blocks in (%c196, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1x64x56x56xf16>, %arg1 : memref<1x64x56x56xf16>, %alloc : memref<1x64x56x56xf16>) + return %alloc : memref<1x64x56x56xf16> } - func.func private @Unknown68(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown68", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown16(%arg0: memref<128x64x1x1xf32>) -> memref<128x64x1x1xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 8 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = 
"Unknown16", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c8 = arith.constant 8 : index %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<64xf32> - gpu.launch_func @unified::@Unknown68 blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64xf32>, %arg1 : memref<64xf32>, %alloc : memref<64xf32>) - return %alloc : memref<64xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<128x64x1x1xf16> + gpu.launch_func @unified::@Unknown16 blocks in (%c8, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<128x64x1x1xf32>, %alloc : memref<128x64x1x1xf16>) + return %alloc : memref<128x64x1x1xf16> } - func.func private @Unknown69(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown69", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown18(%arg0: memref<128x64x3x3xf32>) -> memref<128x64x3x3xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 72 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown18", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c72 = arith.constant 72 : index %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<64xf32> - gpu.launch_func @unified::@Unknown69 blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64xf32>, %arg1 : memref<64xf32>, %alloc : memref<64xf32>) - return %alloc : memref<64xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<128x64x3x3xf16> + gpu.launch_func @unified::@Unknown18 blocks in (%c72, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<128x64x3x3xf32>, %alloc : memref<128x64x3x3xf16>) + return %alloc : memref<128x64x3x3xf16> } - func.func private @Unknown70(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown70", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown20(%arg0: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 98 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown20", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c98 = arith.constant 98 : index %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<64xf32> - gpu.launch_func @unified::@Unknown70 blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64xf32>, %arg1 : memref<64xf32>, %alloc : memref<64xf32>) - return %alloc : memref<64xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<1x128x28x28xf16> + gpu.launch_func @unified::@Unknown20 blocks in (%c98, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1x128x28x28xf16>, %alloc : memref<1x128x28x28xf16>) + return %alloc : memref<1x128x28x28xf16> } - func.func private 
@Unknown71(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown71", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown21(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 144 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown21", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c144 = arith.constant 144 : index %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<128xf32> - gpu.launch_func @unified::@Unknown71 blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128xf32>, %arg1 : memref<128xf32>, %alloc : memref<128xf32>) - return %alloc : memref<128xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<128x128x3x3xf16> + gpu.launch_func @unified::@Unknown21 blocks in (%c144, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<128x128x3x3xf32>, %alloc : memref<128x128x3x3xf16>) + return %alloc : memref<128x128x3x3xf16> } - func.func private @Unknown72(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown72", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown23(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 98 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown23", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c98 = arith.constant 98 : index %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<128xf32> - gpu.launch_func @unified::@Unknown72 blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128xf32>, %arg1 : memref<128xf32>, %alloc : memref<128xf32>) - return %alloc : memref<128xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<1x128x28x28xf16> + gpu.launch_func @unified::@Unknown23 blocks in (%c98, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1x128x28x28xf16>, %arg1 : memref<1x128x28x28xf16>, %alloc : memref<1x128x28x28xf16>) + return %alloc : memref<1x128x28x28xf16> } - func.func private @Unknown73(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown73", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown30(%arg0: memref<256x128x1x1xf32>) -> memref<256x128x1x1xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 32 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown30", 
__byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c32 = arith.constant 32 : index %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<128xf32> - gpu.launch_func @unified::@Unknown73 blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128xf32>, %arg1 : memref<128xf32>, %alloc : memref<128xf32>) - return %alloc : memref<128xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<256x128x1x1xf16> + gpu.launch_func @unified::@Unknown30 blocks in (%c32, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<256x128x1x1xf32>, %alloc : memref<256x128x1x1xf16>) + return %alloc : memref<256x128x1x1xf16> } - func.func private @Unknown74(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown74", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown32(%arg0: memref<256x128x3x3xf32>) -> memref<256x128x3x3xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown32", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c288 = arith.constant 288 : index %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<128xf32> - gpu.launch_func @unified::@Unknown74 blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128xf32>, %arg1 : memref<128xf32>, %alloc : memref<128xf32>) - return %alloc : memref<128xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<256x128x3x3xf16> + gpu.launch_func @unified::@Unknown32 blocks in (%c288, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<256x128x3x3xf32>, %alloc : memref<256x128x3x3xf16>) + return %alloc : memref<256x128x3x3xf16> } - func.func private @Unknown75(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown75", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown34(%arg0: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 49 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown34", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c49 = arith.constant 49 : index %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<128xf32> - gpu.launch_func @unified::@Unknown75 blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128xf32>, %arg1 : memref<128xf32>, %alloc : memref<128xf32>) - return %alloc : memref<128xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<1x256x14x14xf16> + gpu.launch_func @unified::@Unknown34 blocks in (%c49, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1x256x14x14xf16>, %alloc : memref<1x256x14x14xf16>) + return %alloc : memref<1x256x14x14xf16> } 
- func.func private @Unknown76(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown76", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown35(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 576 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown35", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c576 = arith.constant 576 : index %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<128xf32> - gpu.launch_func @unified::@Unknown76 blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128xf32>, %arg1 : memref<128xf32>, %alloc : memref<128xf32>) - return %alloc : memref<128xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<256x256x3x3xf16> + gpu.launch_func @unified::@Unknown35 blocks in (%c576, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<256x256x3x3xf32>, %alloc : memref<256x256x3x3xf16>) + return %alloc : memref<256x256x3x3xf16> } - func.func private @Unknown77(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown77", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown37(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 49 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown37", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c49 = arith.constant 49 : index %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<128xf32> - gpu.launch_func @unified::@Unknown77 blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128xf32>, %arg1 : memref<128xf32>, %alloc : memref<128xf32>) - return %alloc : memref<128xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<1x256x14x14xf16> + gpu.launch_func @unified::@Unknown37 blocks in (%c49, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1x256x14x14xf16>, %arg1 : memref<1x256x14x14xf16>, %alloc : memref<1x256x14x14xf16>) + return %alloc : memref<1x256x14x14xf16> } - func.func private @Unknown78(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown78", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + func.func private @Unknown44(%arg0: memref<512x256x1x1xf32>) -> memref<512x256x1x1xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 128 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown44", 
__byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { %c128 = arith.constant 128 : index %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<128xf32> - gpu.launch_func @unified::@Unknown78 blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128xf32>, %arg1 : memref<128xf32>, %alloc : memref<128xf32>) - return %alloc : memref<128xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<512x256x1x1xf16> + gpu.launch_func @unified::@Unknown44 blocks in (%c128, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<512x256x1x1xf32>, %alloc : memref<512x256x1x1xf16>) + return %alloc : memref<512x256x1x1xf16> } - func.func private @Unknown79(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown79", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown46(%arg0: memref<512x256x3x3xf32>) -> memref<512x256x3x3xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown46", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c1152 = arith.constant 1152 : index %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<128xf32> - gpu.launch_func @unified::@Unknown79 blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128xf32>, %arg1 : memref<128xf32>, %alloc : memref<128xf32>) - return %alloc : memref<128xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<512x256x3x3xf16> + gpu.launch_func @unified::@Unknown46 blocks in (%c1152, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<512x256x3x3xf32>, %alloc : memref<512x256x3x3xf16>) + return %alloc : memref<512x256x3x3xf16> } - func.func private @Unknown80(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown80", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown48(%arg0: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 25 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown48", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c25 = arith.constant 25 : index %c1 = arith.constant 1 : index - %alloc = memref.alloc() : memref<128xf32> - gpu.launch_func @unified::@Unknown80 blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128xf32>, %arg1 : memref<128xf32>, %alloc : memref<128xf32>) - return %alloc : memref<128xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<1x512x7x7xf16> + gpu.launch_func @unified::@Unknown48 blocks in (%c25, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1x512x7x7xf16>, %alloc : memref<1x512x7x7xf16>) + return %alloc : memref<1x512x7x7xf16> } - 
func.func private @Unknown81(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown81", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown49(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 2304 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown49", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c2304 = arith.constant 2304 : index %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %alloc = memref.alloc() : memref<256xf32> - gpu.launch_func @unified::@Unknown81 blocks in (%c2, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256xf32>, %arg1 : memref<256xf32>, %alloc : memref<256xf32>) - return %alloc : memref<256xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<512x512x3x3xf16> + gpu.launch_func @unified::@Unknown49 blocks in (%c2304, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<512x512x3x3xf32>, %alloc : memref<512x512x3x3xf16>) + return %alloc : memref<512x512x3x3xf16> } - func.func private @Unknown82(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown82", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown51(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 25 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown51", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c25 = arith.constant 25 : index %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %alloc = memref.alloc() : memref<256xf32> - gpu.launch_func @unified::@Unknown82 blocks in (%c2, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256xf32>, %arg1 : memref<256xf32>, %alloc : memref<256xf32>) - return %alloc : memref<256xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<1x512x7x7xf16> + gpu.launch_func @unified::@Unknown51 blocks in (%c25, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1x512x7x7xf16>, %arg1 : memref<1x512x7x7xf16>, %alloc : memref<1x512x7x7xf16>) + return %alloc : memref<1x512x7x7xf16> } - func.func private @Unknown83(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown83", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index + func.func private @Unknown58(%arg0: memref<1x512x7x7xf16>) -> memref<1x512xf16> attributes {__byteir_reduction_fusion__} { + %c4 = arith.constant 
4 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c512 = arith.constant 512 : index %c2 = arith.constant 2 : index - %alloc = memref.alloc() : memref<256xf32> - gpu.launch_func @unified::@Unknown83 blocks in (%c2, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256xf32>, %arg1 : memref<256xf32>, %alloc : memref<256xf32>) - return %alloc : memref<256xf32> - } - func.func private @Unknown84(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown84", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %alloc = memref.alloc() : memref<256xf32> - gpu.launch_func @unified::@Unknown84 blocks in (%c2, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256xf32>, %arg1 : memref<256xf32>, %alloc : memref<256xf32>) - return %alloc : memref<256xf32> + %c49 = arith.constant 49 : index + %c64 = arith.constant 64 : index + %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %collapse_shape = memref.collapse_shape %arg0 [[0, 1], [2, 3]] : memref<1x512x7x7xf16> into memref<512x49xf16> + %alloc = memref.alloc() : memref<512xf16> + gpu.launch_func @unified::@Unknown58_kernel blocks in (%c512, %c1, %c1) threads in (%c64, %c1, %c1) args(%collapse_shape : memref<512x49xf16>, %alloc : memref<512xf16>) + %expand_shape = memref.expand_shape %alloc [[0, 1]] : memref<512xf16> into memref<1x512xf16> + return %expand_shape : memref<1x512xf16> } - func.func private @Unknown85(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown85", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown59(%arg0: memref<1x512xf16>) -> memref<1x512xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown59", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %alloc = memref.alloc() : memref<256xf32> - gpu.launch_func @unified::@Unknown85 blocks in (%c2, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256xf32>, %arg1 : memref<256xf32>, %alloc : memref<256xf32>) - return %alloc : memref<256xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<1x512xf16> + gpu.launch_func @unified::@Unknown59 blocks in (%c1, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1x512xf16>, %alloc : memref<1x512xf16>) + return %alloc : memref<1x512xf16> } - func.func private @Unknown86(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown86", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = 
arith.constant 128 : index + func.func private @Unknown60(%arg0: memref<1000x512xf32>) -> memref<1000x512xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 500 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown60", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c500 = arith.constant 500 : index %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %alloc = memref.alloc() : memref<256xf32> - gpu.launch_func @unified::@Unknown86 blocks in (%c2, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256xf32>, %arg1 : memref<256xf32>, %alloc : memref<256xf32>) - return %alloc : memref<256xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<1000x512xf16> + gpu.launch_func @unified::@Unknown60 blocks in (%c500, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1000x512xf32>, %alloc : memref<1000x512xf16>) + return %alloc : memref<1000x512xf16> } - func.func private @Unknown87(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown87", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown61(%arg0: memref<1000xf32>, %arg1: memref<1x1000xf16>) -> memref<1x1000xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown61", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %alloc = memref.alloc() : memref<256xf32> - gpu.launch_func @unified::@Unknown87 blocks in (%c2, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256xf32>, %arg1 : memref<256xf32>, %alloc : memref<256xf32>) - return %alloc : memref<256xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<1x1000xf16> + gpu.launch_func @unified::@Unknown61 blocks in (%c1, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1000xf32>, %arg1 : memref<1x1000xf16>, %alloc : memref<1x1000xf16>) + return %alloc : memref<1x1000xf16> } - func.func private @Unknown88(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown88", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown62(%arg0: memref<64xf32>, %arg1: memref<64xf32>) -> memref<64xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown62", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %alloc = memref.alloc() : memref<256xf32> - gpu.launch_func @unified::@Unknown88 blocks in (%c2, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256xf32>, %arg1 : memref<256xf32>, %alloc : 
memref<256xf32>) - return %alloc : memref<256xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<64xf32> + gpu.launch_func @unified::@Unknown62 blocks in (%c1, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<64xf32>, %arg1 : memref<64xf32>, %alloc : memref<64xf32>) + return %alloc : memref<64xf32> } - func.func private @Unknown89(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown89", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown72(%arg0: memref<128xf32>, %arg1: memref<128xf32>) -> memref<128xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown72", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %alloc = memref.alloc() : memref<256xf32> - gpu.launch_func @unified::@Unknown89 blocks in (%c2, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256xf32>, %arg1 : memref<256xf32>, %alloc : memref<256xf32>) - return %alloc : memref<256xf32> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<128xf32> + gpu.launch_func @unified::@Unknown72 blocks in (%c1, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<128xf32>, %arg1 : memref<128xf32>, %alloc : memref<128xf32>) + return %alloc : memref<128xf32> } - func.func private @Unknown90(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown90", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown82(%arg0: memref<256xf32>, %arg1: memref<256xf32>) -> memref<256xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown82", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<256xf32> - gpu.launch_func @unified::@Unknown90 blocks in (%c2, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256xf32>, %arg1 : memref<256xf32>, %alloc : memref<256xf32>) + gpu.launch_func @unified::@Unknown82 blocks in (%c1, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<256xf32>, %arg1 : memref<256xf32>, %alloc : memref<256xf32>) return %alloc : memref<256xf32> } - func.func private @Unknown91(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown91", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - 
%c4 = arith.constant 4 : index - %alloc = memref.alloc() : memref<512xf32> - gpu.launch_func @unified::@Unknown91 blocks in (%c4, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512xf32>, %arg1 : memref<512xf32>, %alloc : memref<512xf32>) - return %alloc : memref<512xf32> - } - func.func private @Unknown92(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown92", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c4 = arith.constant 4 : index - %alloc = memref.alloc() : memref<512xf32> - gpu.launch_func @unified::@Unknown92 blocks in (%c4, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512xf32>, %arg1 : memref<512xf32>, %alloc : memref<512xf32>) - return %alloc : memref<512xf32> - } - func.func private @Unknown93(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown93", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c4 = arith.constant 4 : index - %alloc = memref.alloc() : memref<512xf32> - gpu.launch_func @unified::@Unknown93 blocks in (%c4, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512xf32>, %arg1 : memref<512xf32>, %alloc : memref<512xf32>) - return %alloc : memref<512xf32> - } - func.func private @Unknown94(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown94", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c4 = arith.constant 4 : index - %alloc = memref.alloc() : memref<512xf32> - gpu.launch_func @unified::@Unknown94 blocks in (%c4, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512xf32>, %arg1 : memref<512xf32>, %alloc : memref<512xf32>) - return %alloc : memref<512xf32> - } - func.func private @Unknown95(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown95", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c4 = arith.constant 4 : index - %alloc = memref.alloc() : memref<512xf32> - gpu.launch_func @unified::@Unknown95 blocks in (%c4, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512xf32>, %arg1 : memref<512xf32>, %alloc : memref<512xf32>) - return %alloc : memref<512xf32> - } - func.func private @Unknown96(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown96", 
__byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c4 = arith.constant 4 : index - %alloc = memref.alloc() : memref<512xf32> - gpu.launch_func @unified::@Unknown96 blocks in (%c4, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512xf32>, %arg1 : memref<512xf32>, %alloc : memref<512xf32>) - return %alloc : memref<512xf32> - } - func.func private @Unknown97(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown97", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c4 = arith.constant 4 : index - %alloc = memref.alloc() : memref<512xf32> - gpu.launch_func @unified::@Unknown97 blocks in (%c4, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512xf32>, %arg1 : memref<512xf32>, %alloc : memref<512xf32>) - return %alloc : memref<512xf32> - } - func.func private @Unknown98(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown98", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c4 = arith.constant 4 : index - %alloc = memref.alloc() : memref<512xf32> - gpu.launch_func @unified::@Unknown98 blocks in (%c4, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512xf32>, %arg1 : memref<512xf32>, %alloc : memref<512xf32>) - return %alloc : memref<512xf32> - } - func.func private @Unknown99(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown99", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c4 = arith.constant 4 : index - %alloc = memref.alloc() : memref<512xf32> - gpu.launch_func @unified::@Unknown99 blocks in (%c4, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512xf32>, %arg1 : memref<512xf32>, %alloc : memref<512xf32>) - return %alloc : memref<512xf32> - } - func.func private @Unknown100(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown100", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown92(%arg0: memref<512xf32>, %arg1: memref<512xf32>) -> memref<512xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown92", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { %c1 = 
arith.constant 1 : index - %c4 = arith.constant 4 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<512xf32> - gpu.launch_func @unified::@Unknown100 blocks in (%c4, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512xf32>, %arg1 : memref<512xf32>, %alloc : memref<512xf32>) + gpu.launch_func @unified::@Unknown92 blocks in (%c1, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<512xf32>, %arg1 : memref<512xf32>, %alloc : memref<512xf32>) return %alloc : memref<512xf32> } func.func @main(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64x3x7x7xf32>, %arg3: memref<1000xf32>, %arg4: memref<1000x512xf32>, %arg5: memref<64xf32>, %arg6: memref<64xf32>, %arg7: memref<64xf32>, %arg8: memref<64xf32>, %arg9: memref<64x64x3x3xf32>, %arg10: memref<64x64x3x3xf32>, %arg11: memref<64xf32>, %arg12: memref<64xf32>, %arg13: memref<64xf32>, %arg14: memref<64xf32>, %arg15: memref<64x64x3x3xf32>, %arg16: memref<64x64x3x3xf32>, %arg17: memref<128xf32>, %arg18: memref<128xf32>, %arg19: memref<128xf32>, %arg20: memref<128xf32>, %arg21: memref<128x64x3x3xf32>, %arg22: memref<128x128x3x3xf32>, %arg23: memref<128x64x1x1xf32>, %arg24: memref<128xf32>, %arg25: memref<128xf32>, %arg26: memref<128xf32>, %arg27: memref<128xf32>, %arg28: memref<128xf32>, %arg29: memref<128xf32>, %arg30: memref<128x128x3x3xf32>, %arg31: memref<128x128x3x3xf32>, %arg32: memref<256xf32>, %arg33: memref<256xf32>, %arg34: memref<256xf32>, %arg35: memref<256xf32>, %arg36: memref<256x128x3x3xf32>, %arg37: memref<256x256x3x3xf32>, %arg38: memref<256x128x1x1xf32>, %arg39: memref<256xf32>, %arg40: memref<256xf32>, %arg41: memref<256xf32>, %arg42: memref<256xf32>, %arg43: memref<256xf32>, %arg44: memref<256xf32>, %arg45: memref<256x256x3x3xf32>, %arg46: memref<256x256x3x3xf32>, %arg47: memref<512xf32>, %arg48: memref<512xf32>, %arg49: memref<512xf32>, %arg50: memref<512xf32>, %arg51: memref<512x256x3x3xf32>, %arg52: memref<512x512x3x3xf32>, %arg53: memref<512x256x1x1xf32>, %arg54: memref<512xf32>, %arg55: memref<512xf32>, %arg56: memref<512xf32>, %arg57: memref<512xf32>, %arg58: memref<512xf32>, %arg59: memref<512xf32>, %arg60: memref<512x512x3x3xf32>, %arg61: memref<512x512x3x3xf32>, %arg62: memref, %arg63: memref<64xf32>, %arg64: memref<64xf32>, %arg65: memref, %arg66: memref<64xf32>, %arg67: memref<64xf32>, %arg68: memref, %arg69: memref<64xf32>, %arg70: memref<64xf32>, %arg71: memref, %arg72: memref<64xf32>, %arg73: memref<64xf32>, %arg74: memref, %arg75: memref<64xf32>, %arg76: memref<64xf32>, %arg77: memref, %arg78: memref<128xf32>, %arg79: memref<128xf32>, %arg80: memref, %arg81: memref<128xf32>, %arg82: memref<128xf32>, %arg83: memref, %arg84: memref<128xf32>, %arg85: memref<128xf32>, %arg86: memref, %arg87: memref<128xf32>, %arg88: memref<128xf32>, %arg89: memref, %arg90: memref<128xf32>, %arg91: memref<128xf32>, %arg92: memref, %arg93: memref<256xf32>, %arg94: memref<256xf32>, %arg95: memref, %arg96: memref<256xf32>, %arg97: memref<256xf32>, %arg98: memref, %arg99: memref<256xf32>, %arg100: memref<256xf32>, %arg101: memref, %arg102: memref<256xf32>, %arg103: memref<256xf32>, %arg104: memref, %arg105: memref<256xf32>, %arg106: memref<256xf32>, %arg107: memref, %arg108: memref<512xf32>, %arg109: memref<512xf32>, %arg110: memref, %arg111: memref<512xf32>, %arg112: memref<512xf32>, %arg113: memref, %arg114: memref<512xf32>, %arg115: memref<512xf32>, %arg116: memref, %arg117: memref<512xf32>, %arg118: memref<512xf32>, %arg119: memref, %arg120: memref<512xf32>, %arg121: memref<512xf32>, 
%arg122: memref<1x3x224x224xf32>) -> (memref<1x1000xf16>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<64x3x7x7xf16>, memref<1x3x224x224xf16>, memref<1x64x112x112xf16>, memref<1x64x112x112xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<128x64x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<128x64x1x1xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<256x128x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<256x128x1x1xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<512x256x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<512x256x1x1xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<1x512xf16>, memref<512x1000xf16>) attributes {__placeholder__byre.entry_point} { @@ -3150,7 +1015,7 @@ module attributes {gpu.container_module} { %alloc_7 = memref.alloc() : memref<64xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_4, %arg6, %arg5, %alloc_5, %alloc_6, %alloc_7) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32> %4 = call @Unknown6(%alloc_5) : (memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> - %5 = call @Unknown7(%arg10) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> + %5 = 
call @Unknown4(%arg10) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> %alloc_8 = memref.alloc() : memref<1x64x56x56xf16> byre.compute @ConvOp_f16f16_f16(%4, %5, %alloc_8) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16> %alloc_9 = memref.alloc() : memref<1x64x56x56xf16> @@ -3158,22 +1023,22 @@ module attributes {gpu.container_module} { %alloc_11 = memref.alloc() : memref<64xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_8, %arg8, %arg7, %alloc_9, %alloc_10, %alloc_11) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32> %6 = call @Unknown9(%alloc_9, %alloc_3) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> - %7 = call @Unknown10(%arg15) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> + %7 = call @Unknown4(%arg15) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> %alloc_12 = memref.alloc() : memref<1x64x56x56xf16> byre.compute @ConvOp_f16f16_f16(%6, %7, %alloc_12) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16> %alloc_13 = memref.alloc() : memref<1x64x56x56xf16> %alloc_14 = memref.alloc() : memref<64xf32> %alloc_15 = memref.alloc() : memref<64xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_12, %arg12, %arg11, %alloc_13, %alloc_14, %alloc_15) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32> - %8 = call @Unknown12(%alloc_13) : (memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> - %9 = call @Unknown13(%arg16) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> + %8 = call @Unknown6(%alloc_13) : (memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> + %9 = call @Unknown4(%arg16) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> %alloc_16 = memref.alloc() : memref<1x64x56x56xf16> byre.compute @ConvOp_f16f16_f16(%8, %9, %alloc_16) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16> %alloc_17 = memref.alloc() : memref<1x64x56x56xf16> %alloc_18 = memref.alloc() : memref<64xf32> %alloc_19 = memref.alloc() : memref<64xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_16, %arg14, %arg13, %alloc_17, %alloc_18, %alloc_19) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : 
i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<1x64x56x56xf16>, memref<64xf32>, memref<64xf32> - %10 = call @Unknown15(%alloc_17, %6) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> + %10 = call @Unknown9(%alloc_17, %6) : (memref<1x64x56x56xf16>, memref<1x64x56x56xf16>) -> memref<1x64x56x56xf16> %11 = call @Unknown16(%arg23) : (memref<128x64x1x1xf32>) -> memref<128x64x1x1xf16> %alloc_20 = memref.alloc() : memref<1x128x28x28xf16> byre.compute @ConvOp_f16f16_f16(%10, %11, %alloc_20) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x64x56x56xf16>, memref<128x64x1x1xf16>, memref<1x128x28x28xf16> @@ -3197,22 +1062,22 @@ module attributes {gpu.container_module} { %alloc_31 = memref.alloc() : memref<128xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_28, %arg20, %arg19, %alloc_29, %alloc_30, %alloc_31) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<1x128x28x28xf16>, memref<128xf32>, memref<128xf32> %15 = call @Unknown23(%alloc_29, %alloc_21) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> - %16 = call @Unknown24(%arg30) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> + %16 = call @Unknown21(%arg30) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> %alloc_32 = memref.alloc() : memref<1x128x28x28xf16> byre.compute @ConvOp_f16f16_f16(%15, %16, %alloc_32) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16> %alloc_33 = memref.alloc() : memref<1x128x28x28xf16> %alloc_34 = memref.alloc() : memref<128xf32> %alloc_35 = memref.alloc() : memref<128xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_32, %arg27, %arg26, %alloc_33, %alloc_34, %alloc_35) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<1x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %17 = call @Unknown26(%alloc_33) : (memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> - %18 = call @Unknown27(%arg31) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> + %17 = call @Unknown20(%alloc_33) : (memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> + %18 = call @Unknown21(%arg31) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> %alloc_36 = memref.alloc() : memref<1x128x28x28xf16> byre.compute @ConvOp_f16f16_f16(%17, %18, %alloc_36) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, 
rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16> %alloc_37 = memref.alloc() : memref<1x128x28x28xf16> %alloc_38 = memref.alloc() : memref<128xf32> %alloc_39 = memref.alloc() : memref<128xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_36, %arg29, %arg28, %alloc_37, %alloc_38, %alloc_39) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<1x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %19 = call @Unknown29(%alloc_37, %15) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> + %19 = call @Unknown23(%alloc_37, %15) : (memref<1x128x28x28xf16>, memref<1x128x28x28xf16>) -> memref<1x128x28x28xf16> %20 = call @Unknown30(%arg38) : (memref<256x128x1x1xf32>) -> memref<256x128x1x1xf16> %alloc_40 = memref.alloc() : memref<1x256x14x14xf16> byre.compute @ConvOp_f16f16_f16(%19, %20, %alloc_40) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16>, memref<256x128x1x1xf16>, memref<1x256x14x14xf16> @@ -3236,22 +1101,22 @@ module attributes {gpu.container_module} { %alloc_51 = memref.alloc() : memref<256xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_48, %arg35, %arg34, %alloc_49, %alloc_50, %alloc_51) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<1x256x14x14xf16>, memref<256xf32>, memref<256xf32> %24 = call @Unknown37(%alloc_49, %alloc_41) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> - %25 = call @Unknown38(%arg45) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> + %25 = call @Unknown35(%arg45) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> %alloc_52 = memref.alloc() : memref<1x256x14x14xf16> byre.compute @ConvOp_f16f16_f16(%24, %25, %alloc_52) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16> %alloc_53 = memref.alloc() : memref<1x256x14x14xf16> %alloc_54 = memref.alloc() : memref<256xf32> %alloc_55 = memref.alloc() : memref<256xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_52, %arg42, %arg41, %alloc_53, %alloc_54, %alloc_55) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<1x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %26 = call @Unknown40(%alloc_53) : (memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> - %27 = call @Unknown41(%arg46) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> + %26 = call @Unknown34(%alloc_53) : 
(memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> + %27 = call @Unknown35(%arg46) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> %alloc_56 = memref.alloc() : memref<1x256x14x14xf16> byre.compute @ConvOp_f16f16_f16(%26, %27, %alloc_56) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16> %alloc_57 = memref.alloc() : memref<1x256x14x14xf16> %alloc_58 = memref.alloc() : memref<256xf32> %alloc_59 = memref.alloc() : memref<256xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_56, %arg44, %arg43, %alloc_57, %alloc_58, %alloc_59) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<1x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %28 = call @Unknown43(%alloc_57, %24) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> + %28 = call @Unknown37(%alloc_57, %24) : (memref<1x256x14x14xf16>, memref<1x256x14x14xf16>) -> memref<1x256x14x14xf16> %29 = call @Unknown44(%arg53) : (memref<512x256x1x1xf32>) -> memref<512x256x1x1xf16> %alloc_60 = memref.alloc() : memref<1x512x7x7xf16> byre.compute @ConvOp_f16f16_f16(%28, %29, %alloc_60) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16>, memref<512x256x1x1xf16>, memref<1x512x7x7xf16> @@ -3275,71 +1140,70 @@ module attributes {gpu.container_module} { %alloc_71 = memref.alloc() : memref<512xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_68, %arg50, %arg49, %alloc_69, %alloc_70, %alloc_71) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<1x512x7x7xf16>, memref<512xf32>, memref<512xf32> %33 = call @Unknown51(%alloc_69, %alloc_61) : (memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> - %34 = call @Unknown52(%arg60) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> + %34 = call @Unknown49(%arg60) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> %alloc_72 = memref.alloc() : memref<1x512x7x7xf16> byre.compute @ConvOp_f16f16_f16(%33, %34, %alloc_72) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16> %alloc_73 = memref.alloc() : memref<1x512x7x7xf16> %alloc_74 = memref.alloc() : memref<512xf32> %alloc_75 = memref.alloc() : memref<512xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_72, %arg57, %arg56, %alloc_73, %alloc_74, %alloc_75) 
{epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<1x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %35 = call @Unknown54(%alloc_73) : (memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> - %36 = call @Unknown55(%arg61) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> + %35 = call @Unknown48(%alloc_73) : (memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> + %36 = call @Unknown49(%arg61) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> %alloc_76 = memref.alloc() : memref<1x512x7x7xf16> byre.compute @ConvOp_f16f16_f16(%35, %36, %alloc_76) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16> %alloc_77 = memref.alloc() : memref<1x512x7x7xf16> %alloc_78 = memref.alloc() : memref<512xf32> %alloc_79 = memref.alloc() : memref<512xf32> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_76, %arg59, %arg58, %alloc_77, %alloc_78, %alloc_79) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<1x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %37 = call @Unknown57(%alloc_77, %33) : (memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> - %alloc_80 = memref.alloc() : memref<1x512xf16> - byre.compute @ReduceSumOp_f16_f16(%37, %alloc_80) {dimensions = dense<[3, 2]> : tensor<2xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<1x512x7x7xf16>, memref<1x512xf16> - %38 = call @Unknown58(%alloc_80) : (memref<1x512xf16>) -> memref<1x512xf16> - %39 = call @Unknown59(%arg4) : (memref<1000x512xf32>) -> memref<1000x512xf16> - %alloc_81 = memref.alloc() : memref<512x1000xf16> - byre.compute @TransposeOp_f16_f16(%39, %alloc_81) {memory_effects = [1 : i32, 2 : i32], minor_to_major = dense<[0, 1]> : tensor<2xindex>, permutation = dense<[1, 0]> : tensor<2xi64>} : memref<1000x512xf16>, memref<512x1000xf16> - %alloc_82 = memref.alloc() : memref<1x1000xf16> - byre.compute @MatmulOp_f16f16_f16(%38, %39, %alloc_82) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<1x512xf16>, memref<1000x512xf16>, memref<1x1000xf16> - %40 = call @Unknown60(%arg3, %alloc_82) : (memref<1000xf32>, memref<1x1000xf16>) -> memref<1x1000xf16> - %41 = call @Unknown61(%alloc_1, %arg63) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %42 = call @Unknown62(%alloc_2, %arg64) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %43 = call @Unknown63(%alloc_6, %arg66) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %44 = call @Unknown64(%alloc_7, %arg67) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %45 = call @Unknown65(%alloc_10, %arg69) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %46 = call @Unknown66(%alloc_11, %arg70) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %47 = call @Unknown67(%alloc_14, %arg72) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %48 = call @Unknown68(%alloc_15, %arg73) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> - %49 = 
call @Unknown69(%alloc_18, %arg75) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32>
-    %50 = call @Unknown70(%alloc_19, %arg76) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32>
-    %51 = call @Unknown71(%alloc_26, %arg78) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32>
-    %52 = call @Unknown72(%alloc_27, %arg79) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32>
-    %53 = call @Unknown73(%alloc_30, %arg81) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32>
-    %54 = call @Unknown74(%alloc_31, %arg82) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32>
-    %55 = call @Unknown75(%alloc_22, %arg84) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32>
-    %56 = call @Unknown76(%alloc_23, %arg85) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32>
-    %57 = call @Unknown77(%alloc_34, %arg87) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32>
-    %58 = call @Unknown78(%alloc_35, %arg88) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32>
-    %59 = call @Unknown79(%alloc_38, %arg90) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32>
-    %60 = call @Unknown80(%alloc_39, %arg91) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32>
-    %61 = call @Unknown81(%alloc_46, %arg93) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32>
-    %62 = call @Unknown82(%alloc_47, %arg94) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32>
-    %63 = call @Unknown83(%alloc_50, %arg96) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32>
-    %64 = call @Unknown84(%alloc_51, %arg97) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32>
-    %65 = call @Unknown85(%alloc_42, %arg99) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32>
-    %66 = call @Unknown86(%alloc_43, %arg100) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32>
-    %67 = call @Unknown87(%alloc_54, %arg102) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32>
-    %68 = call @Unknown88(%alloc_55, %arg103) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32>
-    %69 = call @Unknown89(%alloc_58, %arg105) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32>
-    %70 = call @Unknown90(%alloc_59, %arg106) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32>
-    %71 = call @Unknown91(%alloc_66, %arg108) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32>
-    %72 = call @Unknown92(%alloc_67, %arg109) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32>
-    %73 = call @Unknown93(%alloc_70, %arg111) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32>
-    %74 = call @Unknown94(%alloc_71, %arg112) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32>
-    %75 = call @Unknown95(%alloc_62, %arg114) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32>
-    %76 = call @Unknown96(%alloc_63, %arg115) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32>
-    %77 = call @Unknown97(%alloc_74, %arg117) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32>
-    %78 = call @Unknown98(%alloc_75, %arg118) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32>
-    %79 = call @Unknown99(%alloc_78, %arg120) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32>
-    %80 = call @Unknown100(%alloc_79, %arg121) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32>
-    return %40, %arg0, %arg1, %arg5, %arg6, %arg7, %arg8, %arg11, %arg12, %arg13, %arg14, %arg17, %arg18, %arg19, %arg20, %arg24, %arg25, %arg26, %arg27, %arg28, %arg29, %arg32, %arg33, %arg34, %arg35, %arg39, %arg40, %arg41, %arg42, %arg43, %arg44, %arg47, %arg48, %arg49, %arg50, %arg54, %arg55, %arg56, %arg57, %arg58, %arg59, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51,
%52, %53, %54, %55, %56, %57, %58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %1, %0, %alloc, %2, %alloc_3, %3, %alloc_4, %4, %5, %alloc_8, %6, %7, %alloc_12, %8, %9, %alloc_16, %10, %12, %alloc_24, %13, %14, %alloc_28, %11, %alloc_20, %15, %16, %alloc_32, %17, %18, %alloc_36, %19, %21, %alloc_44, %22, %23, %alloc_48, %20, %alloc_40, %24, %25, %alloc_52, %26, %27, %alloc_56, %28, %30, %alloc_64, %31, %32, %alloc_68, %29, %alloc_60, %33, %34, %alloc_72, %35, %36, %alloc_76, %37, %38, %alloc_81 : memref<1x1000xf16>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<64x3x7x7xf16>, memref<1x3x224x224xf16>, memref<1x64x112x112xf16>, memref<1x64x112x112xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<128x64x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<128x64x1x1xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<256x128x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<256x128x1x1xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<512x256x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<512x256x1x1xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<1x512xf16>, memref<512x1000xf16> + %37 = call @Unknown51(%alloc_77, %33) : (memref<1x512x7x7xf16>, memref<1x512x7x7xf16>) -> memref<1x512x7x7xf16> + %38 = call @Unknown58(%37) : 
(memref<1x512x7x7xf16>) -> memref<1x512xf16> + %39 = call @Unknown59(%38) : (memref<1x512xf16>) -> memref<1x512xf16> + %40 = call @Unknown60(%arg4) : (memref<1000x512xf32>) -> memref<1000x512xf16> + %alloc_80 = memref.alloc() : memref<512x1000xf16> + byre.compute @TransposeOp_f16_f16(%40, %alloc_80) {memory_effects = [1 : i32, 2 : i32], minor_to_major = dense<[0, 1]> : tensor<2xindex>, permutation = dense<[1, 0]> : tensor<2xi64>} : memref<1000x512xf16>, memref<512x1000xf16> + %alloc_81 = memref.alloc() : memref<1x1000xf16> + byre.compute @MatmulOp_f16f16_f16(%39, %40, %alloc_81) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<1x512xf16>, memref<1000x512xf16>, memref<1x1000xf16> + %41 = call @Unknown61(%arg3, %alloc_81) : (memref<1000xf32>, memref<1x1000xf16>) -> memref<1x1000xf16> + %42 = call @Unknown62(%alloc_1, %arg63) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %43 = call @Unknown62(%alloc_2, %arg64) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %44 = call @Unknown62(%alloc_6, %arg66) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %45 = call @Unknown62(%alloc_7, %arg67) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %46 = call @Unknown62(%alloc_10, %arg69) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %47 = call @Unknown62(%alloc_11, %arg70) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %48 = call @Unknown62(%alloc_14, %arg72) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %49 = call @Unknown62(%alloc_15, %arg73) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %50 = call @Unknown62(%alloc_18, %arg75) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %51 = call @Unknown62(%alloc_19, %arg76) : (memref<64xf32>, memref<64xf32>) -> memref<64xf32> + %52 = call @Unknown72(%alloc_26, %arg78) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %53 = call @Unknown72(%alloc_27, %arg79) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %54 = call @Unknown72(%alloc_30, %arg81) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %55 = call @Unknown72(%alloc_31, %arg82) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %56 = call @Unknown72(%alloc_22, %arg84) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %57 = call @Unknown72(%alloc_23, %arg85) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %58 = call @Unknown72(%alloc_34, %arg87) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %59 = call @Unknown72(%alloc_35, %arg88) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %60 = call @Unknown72(%alloc_38, %arg90) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %61 = call @Unknown72(%alloc_39, %arg91) : (memref<128xf32>, memref<128xf32>) -> memref<128xf32> + %62 = call @Unknown82(%alloc_46, %arg93) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %63 = call @Unknown82(%alloc_47, %arg94) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %64 = call @Unknown82(%alloc_50, %arg96) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %65 = call @Unknown82(%alloc_51, %arg97) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %66 = call @Unknown82(%alloc_42, %arg99) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %67 = call @Unknown82(%alloc_43, %arg100) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %68 = call @Unknown82(%alloc_54, %arg102) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %69 = call @Unknown82(%alloc_55, 
%arg103) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %70 = call @Unknown82(%alloc_58, %arg105) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %71 = call @Unknown82(%alloc_59, %arg106) : (memref<256xf32>, memref<256xf32>) -> memref<256xf32> + %72 = call @Unknown92(%alloc_66, %arg108) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %73 = call @Unknown92(%alloc_67, %arg109) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %74 = call @Unknown92(%alloc_70, %arg111) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %75 = call @Unknown92(%alloc_71, %arg112) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %76 = call @Unknown92(%alloc_62, %arg114) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %77 = call @Unknown92(%alloc_63, %arg115) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %78 = call @Unknown92(%alloc_74, %arg117) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %79 = call @Unknown92(%alloc_75, %arg118) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %80 = call @Unknown92(%alloc_78, %arg120) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + %81 = call @Unknown92(%alloc_79, %arg121) : (memref<512xf32>, memref<512xf32>) -> memref<512xf32> + return %41, %arg0, %arg1, %arg5, %arg6, %arg7, %arg8, %arg11, %arg12, %arg13, %arg14, %arg17, %arg18, %arg19, %arg20, %arg24, %arg25, %arg26, %arg27, %arg28, %arg29, %arg32, %arg33, %arg34, %arg35, %arg39, %arg40, %arg41, %arg42, %arg43, %arg44, %arg47, %arg48, %arg49, %arg50, %arg54, %arg55, %arg56, %arg57, %arg58, %arg59, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %81, %1, %0, %alloc, %2, %alloc_3, %3, %alloc_4, %4, %5, %alloc_8, %6, %7, %alloc_12, %8, %9, %alloc_16, %10, %12, %alloc_24, %13, %14, %alloc_28, %11, %alloc_20, %15, %16, %alloc_32, %17, %18, %alloc_36, %19, %21, %alloc_44, %22, %23, %alloc_48, %20, %alloc_40, %24, %25, %alloc_52, %26, %27, %alloc_56, %28, %30, %alloc_64, %31, %32, %alloc_68, %29, %alloc_60, %33, %34, %alloc_72, %35, %36, %alloc_76, %37, %39, %alloc_80 : memref<1x1000xf16>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<64xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<128xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<256xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, memref<512xf32>, 
memref<512xf32>, memref<512xf32>, memref<64x3x7x7xf16>, memref<1x3x224x224xf16>, memref<1x64x112x112xf16>, memref<1x64x112x112xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<64x64x3x3xf16>, memref<1x64x56x56xf16>, memref<1x64x56x56xf16>, memref<128x64x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<128x64x1x1xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<128x128x3x3xf16>, memref<1x128x28x28xf16>, memref<1x128x28x28xf16>, memref<256x128x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<256x128x1x1xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<256x256x3x3xf16>, memref<1x256x14x14xf16>, memref<1x256x14x14xf16>, memref<512x256x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<512x256x1x1xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<512x512x3x3xf16>, memref<1x512x7x7xf16>, memref<1x512x7x7xf16>, memref<1x512xf16>, memref<512x1000xf16> } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/FW/8_byre_opt.mlir b/compiler/test/E2E/ResNet18/FW/8_byre_opt.mlir index 5947f8979..a109ab380 100644 --- a/compiler/test/E2E/ResNet18/FW/8_byre_opt.mlir +++ b/compiler/test/E2E/ResNet18/FW/8_byre_opt.mlir @@ -4,2585 +4,775 @@ module attributes {gpu.container_module} { gpu.module @unified { - gpu.func @Unknown100(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> - } - gpu.return - } - gpu.func @Unknown99(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> - } - gpu.return - } - gpu.func @Unknown98(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : 
index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> - } - gpu.return - } - gpu.func @Unknown97(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> - } - gpu.return - } - gpu.func @Unknown96(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> - } - gpu.return - } - gpu.func @Unknown95(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> - } - gpu.return - } - gpu.func @Unknown94(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> - } - gpu.return - } - gpu.func @Unknown93(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load 
%arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> - } - gpu.return - } - gpu.func @Unknown92(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> - } - gpu.return - } - gpu.func @Unknown91(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> - } - gpu.return - } - gpu.func @Unknown90(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown89(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown88(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : 
memref<256xf32> - } - gpu.return - } - gpu.func @Unknown87(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown86(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown85(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown84(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown83(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown82(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = 
arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown81(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown80(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown79(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown78(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown77(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x 
- %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown76(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown75(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown74(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown73(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown72(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = 
memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown71(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown70(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown69(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown68(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown67(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - 
memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown66(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown65(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown64(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown63(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown62(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown61(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : 
f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index + gpu.func @Unknown92(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { + %cst = arith.constant 1.000000e-01 : f32 + %cst_0 = arith.constant 0.899999976 : f32 + %c512 = arith.constant 512 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c512 step %6 { + %7 = memref.load %arg1[%arg3] : memref<512xf32> + %8 = memref.load %arg0[%arg3] : memref<512xf32> + %9 = arith.mulf %7, %cst_0 : f32 + %10 = arith.mulf %8, %cst : f32 + %11 = arith.addf %10, %9 : f32 + memref.store %11, %arg2[%arg3] : memref<512xf32> } gpu.return } - gpu.func @Unknown60(%arg0: memref<1000xf32>, %arg1: memref<1x1000xf16>, %arg2: memref<1x1000xf16>) kernel { - %c0 = arith.constant 0 : index - %c1000 = arith.constant 1000 : index + gpu.func @Unknown82(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { + %cst = arith.constant 1.000000e-01 : f32 + %cst_0 = arith.constant 0.899999976 : f32 + %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1000 : index - scf.if %5 { - %6 = memref.load %arg1[%c0, %4] : memref<1x1000xf16> - %7 = memref.load %arg0[%4] : memref<1000xf32> - %8 = arith.truncf %7 : f32 to f16 - %9 = arith.addf %6, %8 : f16 - memref.store %9, %arg2[%c0, %4] : memref<1x1000xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c256 step %6 { + %7 = memref.load %arg1[%arg3] : memref<256xf32> + %8 = memref.load %arg0[%arg3] : memref<256xf32> + %9 = arith.mulf %7, %cst_0 : f32 + %10 = arith.mulf %8, %cst : f32 + %11 = arith.addf %10, %9 : f32 + memref.store %11, %arg2[%arg3] : memref<256xf32> } gpu.return } - gpu.func @Unknown59(%arg0: memref<1000x512xf32>, %arg1: memref<1000x512xf16>) kernel { - %c0 = arith.constant 0 : index - %c512000 = arith.constant 512000 : index - %c512 = arith.constant 512 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown72(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { + %cst = arith.constant 1.000000e-01 : f32 + %cst_0 = arith.constant 0.899999976 : f32 + %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512000 : index - scf.if %5 { - %6 = arith.remsi %4, %c512 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c512 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c512 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9] : memref<1000x512xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9] : memref<1000x512xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c128 step %6 { + 
%7 = memref.load %arg1[%arg3] : memref<128xf32> + %8 = memref.load %arg0[%arg3] : memref<128xf32> + %9 = arith.mulf %7, %cst_0 : f32 + %10 = arith.mulf %8, %cst : f32 + %11 = arith.addf %10, %9 : f32 + memref.store %11, %arg2[%arg3] : memref<128xf32> } gpu.return } - gpu.func @Unknown58(%arg0: memref<1x512xf16>, %arg1: memref<1x512xf16>) kernel { - %cst = arith.constant 2.040100e-02 : f16 - %c0 = arith.constant 0 : index - %c512 = arith.constant 512 : index + gpu.func @Unknown62(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { + %cst = arith.constant 1.000000e-01 : f32 + %cst_0 = arith.constant 0.899999976 : f32 + %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%c0, %4] : memref<1x512xf16> - %7 = arith.mulf %6, %cst : f16 - memref.store %7, %arg1[%c0, %4] : memref<1x512xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c64 step %6 { + %7 = memref.load %arg1[%arg3] : memref<64xf32> + %8 = memref.load %arg0[%arg3] : memref<64xf32> + %9 = arith.mulf %7, %cst_0 : f32 + %10 = arith.mulf %8, %cst : f32 + %11 = arith.addf %10, %9 : f32 + memref.store %11, %arg2[%arg3] : memref<64xf32> } gpu.return } - gpu.func @Unknown57(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 + gpu.func @Unknown61(%arg0: memref<1000xf32>, %arg1: memref<1x1000xf16>, %arg2: memref<1x1000xf16>) kernel { %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index + %c1000 = arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c1000 step %6 { + %7 = memref.load %arg0[%arg3] : memref<1000xf32> + %8 = memref.load %arg1[%c0, %arg3] : memref<1x1000xf16> + %9 = arith.truncf %7 : f32 to f16 + %10 = arith.addf %8, %9 : f16 + memref.store %10, %arg2[%c0, %arg3] : memref<1x1000xf16> } gpu.return } - gpu.func @Unknown55(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c2359296 = 
arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown60(%arg0: memref<1000x512xf32>, %arg1: memref<1000x512xf16>) kernel { + %c512000 = arith.constant 512000 : index %c512 = arith.constant 512 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf16> - } - gpu.return - } - gpu.func @Unknown54(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c512000 step %6 { + %7 = arith.remsi %arg2, %c512 : index + %8 = arith.divsi %arg2, %c512 : index + %9 = memref.load %arg0[%8, %7] : memref<1000x512xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7] : 
memref<1000x512xf16> } gpu.return } - gpu.func @Unknown52(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { + gpu.func @Unknown59(%arg0: memref<1x512xf16>, %arg1: memref<1x512xf16>) kernel { + %cst = arith.constant 2.040100e-02 : f16 %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c512 = arith.constant 512 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c512 step %6 { + %7 = memref.load %arg0[%c0, %arg2] : memref<1x512xf16> + %8 = arith.mulf %7, %cst : f16 + memref.store %8, %arg1[%c0, %arg2] : memref<1x512xf16> } gpu.return } gpu.func @Unknown51(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) kernel { + %c25088 = arith.constant 25088 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load 
%arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c25088 step %6 { + %7 = arith.remsi %arg3, %c7 : index + %8 = arith.divsi %arg3, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %13 = arith.addf %11, %12 : f16 + %14 = arith.maximumf %13, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x512x7x7xf16> } gpu.return } gpu.func @Unknown49(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c512 = arith.constant 512 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c2359296 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x512x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x512x3x3xf16> } gpu.return } gpu.func @Unknown48(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) kernel { + %c25088 = arith.constant 25088 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : 
index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c25088 step %6 { + %7 = arith.remsi %arg2, %c7 : index + %8 = arith.divsi %arg2, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %12 = arith.maximumf %11, %cst : f16 + memref.store %12, %arg1[%c0, %10, %9, %7] : memref<1x512x7x7xf16> } gpu.return } gpu.func @Unknown46(%arg0: memref<512x256x3x3xf32>, %arg1: memref<512x256x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c1179648 = arith.constant 1179648 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1179648 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x256x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1179648 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = 
arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x256x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x256x3x3xf16> } gpu.return } gpu.func @Unknown44(%arg0: memref<512x256x1x1xf32>, %arg1: memref<512x256x1x1xf16>) kernel { - %c0 = arith.constant 0 : index %c131072 = arith.constant 131072 : index - %c256 = arith.constant 256 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c131072 : index - scf.if %5 { - %6 = arith.remsi %4, %c256 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c256 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c256 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<512x256x1x1xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<512x256x1x1xf16> - } - gpu.return - } - gpu.func @Unknown43(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - } - gpu.return - } - gpu.func @Unknown41(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, 
%4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf16> - } - gpu.return - } - gpu.func @Unknown40(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - } - gpu.return - } - gpu.func @Unknown38(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = 
arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c131072 step %6 { + %7 = arith.remsi %arg2, %c256 : index + %8 = arith.divsi %arg2, %c256 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<512x256x1x1xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<512x256x1x1xf16> } gpu.return } gpu.func @Unknown37(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>) kernel { + %c50176 = arith.constant 50176 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c50176 step %6 { + %7 = arith.remsi %arg3, %c14 : index + %8 = arith.divsi %arg3, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %13 = arith.addf %11, %12 : f16 + %14 = arith.maximumf %13, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x256x14x14xf16> } gpu.return } gpu.func @Unknown35(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { - %c0 = 
arith.constant 0 : index %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c589824 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x256x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x256x3x3xf16> } gpu.return } gpu.func @Unknown34(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) kernel { + %c50176 = arith.constant 50176 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = 
memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c50176 step %6 { + %7 = arith.remsi %arg2, %c14 : index + %8 = arith.divsi %arg2, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %12 = arith.maximumf %11, %cst : f16 + memref.store %12, %arg1[%c0, %10, %9, %7] : memref<1x256x14x14xf16> } gpu.return } gpu.func @Unknown32(%arg0: memref<256x128x3x3xf32>, %arg1: memref<256x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c294912 = arith.constant 294912 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c294912 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x128x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c294912 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x128x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x128x3x3xf16> } gpu.return } gpu.func @Unknown30(%arg0: memref<256x128x1x1xf32>, %arg1: memref<256x128x1x1xf16>) kernel { - %c0 = arith.constant 0 : index %c32768 = arith.constant 32768 : index - %c128 = arith.constant 128 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c32768 : index - scf.if %5 { - %6 = arith.remsi %4, %c128 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c128 : index - %9 = arith.select %7, %8, %6 : index - 
%10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c128 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<256x128x1x1xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<256x128x1x1xf16> - } - gpu.return - } - gpu.func @Unknown29(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - } - gpu.return - } - gpu.func @Unknown27(%arg0: memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : 
index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf16> - } - gpu.return - } - gpu.func @Unknown26(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - } - gpu.return - } - gpu.func @Unknown24(%arg0: memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, 
%arg1[%35, %29, %19, %9] : memref<128x128x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c32768 step %6 { + %7 = arith.remsi %arg2, %c128 : index + %8 = arith.divsi %arg2, %c128 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<256x128x1x1xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<256x128x1x1xf16> } gpu.return } gpu.func @Unknown23(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) kernel { + %c100352 = arith.constant 100352 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg3, %c28 : index + %8 = arith.divsi %arg3, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %13 = arith.addf %11, %12 : f16 + %14 = arith.maximumf %13, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x128x28x28xf16> } gpu.return } gpu.func @Unknown21(%arg0: memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 
= arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c147456 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x128x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x128x3x3xf16> } gpu.return } gpu.func @Unknown20(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) kernel { + %c100352 = arith.constant 100352 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg2, %c28 : index + %8 = arith.divsi %arg2, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %12 = arith.maximumf %11, %cst : f16 + memref.store %12, %arg1[%c0, %10, %9, %7] : memref<1x128x28x28xf16> } gpu.return } gpu.func @Unknown18(%arg0: memref<128x64x3x3xf32>, %arg1: memref<128x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c73728 = arith.constant 73728 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index + %c3 = 
arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c73728 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x64x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c73728 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x64x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x64x3x3xf16> } gpu.return } gpu.func @Unknown16(%arg0: memref<128x64x1x1xf32>, %arg1: memref<128x64x1x1xf16>) kernel { - %c0 = arith.constant 0 : index %c8192 = arith.constant 8192 : index - %c64 = arith.constant 64 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c8192 : index - scf.if %5 { - %6 = arith.remsi %4, %c64 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c64 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c64 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<128x64x1x1xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<128x64x1x1xf16> - } - gpu.return - } - gpu.func @Unknown15(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, 
%c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - } - gpu.return - } - gpu.func @Unknown13(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> - } - gpu.return - } - gpu.func @Unknown12(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select 
%7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - } - gpu.return - } - gpu.func @Unknown10(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c8192 step %6 { + %7 = arith.remsi %arg2, %c64 : index + %8 = arith.divsi %arg2, %c64 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<128x64x1x1xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<128x64x1x1xf16> } gpu.return } gpu.func @Unknown9(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) kernel { + %c200704 = arith.constant 200704 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 
= arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - } - gpu.return - } - gpu.func @Unknown7(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg3, %c56 : index + %8 = arith.divsi %arg3, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %13 = arith.addf %11, %12 : f16 + %14 = arith.maximumf %13, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x64x56x56xf16> } gpu.return } gpu.func @Unknown6(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) kernel { + %c200704 = arith.constant 200704 : 
index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg2, %c56 : index + %8 = arith.divsi %arg2, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %12 = arith.maximumf %11, %cst : f16 + memref.store %12, %arg1[%c0, %10, %9, %7] : memref<1x64x56x56xf16> } gpu.return } gpu.func @Unknown4(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, 
%arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c36864 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x64x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x64x3x3xf16> } gpu.return } gpu.func @Unknown3(%arg0: memref<1x64x112x112xf16>, %arg1: memref<1x64x112x112xf16>) kernel { + %c802816 = arith.constant 802816 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index %c112 = arith.constant 112 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c112 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c112 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c112 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c112 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c112 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c112 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x112x112xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x64x112x112xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg2, %c112 : index + %8 = arith.divsi %arg2, %c112 : index + %9 = arith.remsi %8, %c112 : index + %10 = arith.divsi %8, %c112 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x112x112xf16> + %12 = arith.maximumf %11, %cst : f16 + memref.store %12, %arg1[%c0, %10, %9, %7] : memref<1x64x112x112xf16> } gpu.return } gpu.func @Unknown1(%arg0: memref<64x3x7x7xf32>, %arg1: memref<64x3x7x7xf16>) kernel { - %c0 = arith.constant 0 : index %c9408 = arith.constant 9408 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %c3 = arith.constant 3 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c9408 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, 
%15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c3 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c3 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c3 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x3x7x7xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x3x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c9408 step %6 { + %7 = arith.remsi %arg2, %c7 : index + %8 = arith.divsi %arg2, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c3 : index + %12 = arith.divsi %10, %c3 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x3x7x7xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x3x7x7xf16> } gpu.return } gpu.func @Unknown0(%arg0: memref<1x3x224x224xf32>, %arg1: memref<1x3x224x224xf16>) kernel { - %c0 = arith.constant 0 : index %c150528 = arith.constant 150528 : index + %c0 = arith.constant 0 : index %c224 = arith.constant 224 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c150528 : index - scf.if %5 { - %6 = arith.remsi %4, %c224 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c224 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c224 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c224 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c224 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c224 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x3x224x224xf32> - %27 = arith.truncf %26 : f32 to f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x3x224x224xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c150528 step %6 { + %7 = arith.remsi %arg2, %c224 : index + %8 = arith.divsi %arg2, %c224 : index + %9 = arith.remsi %8, %c224 : index + %10 = arith.divsi %8, %c224 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x3x224x224xf32> + %12 = arith.truncf %11 : f32 to f16 + memref.store %12, %arg1[%c0, %10, %9, %7] : memref<1x3x224x224xf16> } gpu.return } + gpu.func @Unknown58_kernel(%arg0: memref<512x49xf16>, %arg1: memref<512xf16>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c2 = arith.constant 2 : index + %c32 = arith.constant 32 : index + %cst = arith.constant 0.000000e+00 : f16 + %c1 = arith.constant 1 : index + %c49 = arith.constant 49 : index + 
%c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %0 = gpu.block_id x + %subview = memref.subview %arg0[%0, 0] [1, 49] [1, 1] : memref<512x49xf16> to memref<49xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<49xf16, strided<[1], offset: ?>> into memref<1x49xf16, strided<[49, 1], offset: ?>> + %alloca = memref.alloca() : memref<64xf16, #gpu.address_space> + %1 = gpu.thread_id x + %2 = arith.remsi %1, %c64 : index + %3 = arith.cmpi slt, %2, %c0 : index + %4 = arith.addi %2, %c64 : index + %5 = arith.select %3, %4, %2 : index + %6 = arith.cmpi slt, %5, %c49 : index + %7 = arith.select %6, %5, %c49 : index + %8 = arith.addi %5, %c1 : index + %9 = arith.cmpi slt, %8, %c49 : index + %10 = arith.select %9, %8, %c49 : index + %11 = arith.subi %10, %7 : index + %subview_0 = memref.subview %expand_shape[0, %7] [1, %11] [1, 1] : memref<1x49xf16, strided<[49, 1], offset: ?>> to memref> + %expand_shape_1 = memref.expand_shape %subview_0 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %12 = arith.cmpi ugt, %11, %c0 : index + %13 = scf.if %12 -> (f16) { + %21 = memref.load %expand_shape_1[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %21 : f16 + } else { + scf.yield %cst : f16 + } + %14 = arith.addf %13, %cst : f16 + memref.store %14, %alloca[%1] : memref<64xf16, #gpu.address_space> + gpu.barrier + %alloca_2 = memref.alloca() : memref<32xf16, #gpu.address_space> + %15 = arith.cmpi ult, %1, %c32 : index + scf.if %15 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca[%21] : memref<64xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca[%24] : memref<64xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_2[%1] : memref<32xf16, #gpu.address_space> + } + gpu.barrier + %alloca_3 = memref.alloca() : memref<16xf16, #gpu.address_space> + %16 = arith.cmpi ult, %1, %c16 : index + scf.if %16 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_2[%21] : memref<32xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_2[%24] : memref<32xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_3[%1] : memref<16xf16, #gpu.address_space> + } + gpu.barrier + %alloca_4 = memref.alloca() : memref<8xf16, #gpu.address_space> + %17 = arith.cmpi ult, %1, %c8 : index + scf.if %17 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_3[%21] : memref<16xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_3[%24] : memref<16xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_4[%1] : memref<8xf16, #gpu.address_space> + } + gpu.barrier + %alloca_5 = memref.alloca() : memref<4xf16, #gpu.address_space> + %18 = arith.cmpi ult, %1, %c4 : index + scf.if %18 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_4[%21] : memref<8xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_4[%24] : memref<8xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_5[%1] : memref<4xf16, #gpu.address_space> + } + gpu.barrier + %alloca_6 = memref.alloca() : memref<2xf16, #gpu.address_space> + %19 = arith.cmpi ult, %1, %c2 : index + scf.if %19 { + %21 = arith.muli %1, %c2 : index + %22 = 
memref.load %alloca_5[%21] : memref<4xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_5[%24] : memref<4xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_6[%1] : memref<2xf16, #gpu.address_space> + } + gpu.barrier + %20 = arith.cmpi ult, %1, %c1 : index + scf.if %20 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_6[%21] : memref<2xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_6[%24] : memref<2xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %arg1[%0] : memref<512xf16> + } + gpu.barrier + gpu.return + } } - func.func private @Unknown0(memref<1x3x224x224xf32, "cuda">) -> memref<1x3x224x224xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1176 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown0", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown1(memref<64x3x7x7xf32, "cuda">) -> memref<64x3x7x7xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 74 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown1", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown3(memref<1x64x112x112xf16, "cuda">) -> memref<1x64x112x112xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 6272 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown3", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown4(memref<64x64x3x3xf32, "cuda">) -> memref<64x64x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown4", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown6(memref<1x64x56x56xf16, "cuda">) -> memref<1x64x56x56xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown6", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown7(memref<64x64x3x3xf32, "cuda">) -> memref<64x64x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown7", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown9(memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">) -> memref<1x64x56x56xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown9", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown10(memref<64x64x3x3xf32, "cuda">) -> 
memref<64x64x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown10", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown12(memref<1x64x56x56xf16, "cuda">) -> memref<1x64x56x56xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown12", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown13(memref<64x64x3x3xf32, "cuda">) -> memref<64x64x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown13", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown15(memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">) -> memref<1x64x56x56xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown15", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown16(memref<128x64x1x1xf32, "cuda">) -> memref<128x64x1x1xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 64 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown16", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown18(memref<128x64x3x3xf32, "cuda">) -> memref<128x64x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 576 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown18", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown20(memref<1x128x28x28xf16, "cuda">) -> memref<1x128x28x28xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown20", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown21(memref<128x128x3x3xf32, "cuda">) -> memref<128x128x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown21", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown23(memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">) -> memref<1x128x28x28xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown23", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private 
@Unknown24(memref<128x128x3x3xf32, "cuda">) -> memref<128x128x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown24", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown26(memref<1x128x28x28xf16, "cuda">) -> memref<1x128x28x28xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown26", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown27(memref<128x128x3x3xf32, "cuda">) -> memref<128x128x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown27", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown29(memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">) -> memref<1x128x28x28xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown29", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown30(memref<256x128x1x1xf32, "cuda">) -> memref<256x128x1x1xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 256 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown30", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown32(memref<256x128x3x3xf32, "cuda">) -> memref<256x128x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2304 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown32", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown34(memref<1x256x14x14xf16, "cuda">) -> memref<1x256x14x14xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 392 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown34", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown35(memref<256x256x3x3xf32, "cuda">) -> memref<256x256x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4608 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown35", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown37(memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">) -> memref<1x256x14x14xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 392 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown37", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, 
device = "cuda"} - func.func private @Unknown38(memref<256x256x3x3xf32, "cuda">) -> memref<256x256x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4608 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown38", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown40(memref<1x256x14x14xf16, "cuda">) -> memref<1x256x14x14xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 392 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown40", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown41(memref<256x256x3x3xf32, "cuda">) -> memref<256x256x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4608 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown41", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown43(memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">) -> memref<1x256x14x14xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 392 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown43", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown44(memref<512x256x1x1xf32, "cuda">) -> memref<512x256x1x1xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1024 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown44", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown46(memref<512x256x3x3xf32, "cuda">) -> memref<512x256x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 9216 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown46", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown48(memref<1x512x7x7xf16, "cuda">) -> memref<1x512x7x7xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown48", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown49(memref<512x512x3x3xf32, "cuda">) -> memref<512x512x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 18432 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown49", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown51(memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">) -> memref<1x512x7x7xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown51", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = 
"PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown52(memref<512x512x3x3xf32, "cuda">) -> memref<512x512x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 18432 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown52", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown54(memref<1x512x7x7xf16, "cuda">) -> memref<1x512x7x7xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown54", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown55(memref<512x512x3x3xf32, "cuda">) -> memref<512x512x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 18432 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown55", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown57(memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">) -> memref<1x512x7x7xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown57", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown58(memref<1x512xf16, "cuda">) -> memref<1x512xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown58", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown59(memref<1000x512xf32, "cuda">) -> memref<1000x512xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4000 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown59", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown60(memref<1000xf32, "cuda">, memref<1x1000xf16, "cuda">) -> memref<1x1000xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 8 : i32, __byre__arg_ranks = [1 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown60", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown61(memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown61", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown62(memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown62", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 
2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown63(memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown63", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown64(memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown64", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown65(memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown65", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown66(memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown66", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown67(memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown67", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown68(memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown68", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown69(memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown69", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown70(memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown70", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown71(memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks 
= [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown71", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown72(memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown72", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown73(memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown73", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown74(memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown74", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown75(memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown75", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown76(memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown76", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown77(memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown77", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown78(memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown78", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown79(memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown79", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown80(memref<128xf32, 
"cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown80", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown81(memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown81", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown82(memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown82", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown83(memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown83", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown84(memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown84", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown85(memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown85", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown86(memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown86", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown87(memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown87", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown88(memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown88", __byteir_elementwise_fusion__, 
arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown89(memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown89", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown90(memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown90", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown91(memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown91", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown92(memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown92", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown93(memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown93", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown94(memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown94", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown95(memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown95", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown96(memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown96", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown97(memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> attributes {__byre__BlockSize.x = 
128 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown97", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown98(memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown98", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown99(memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown99", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown100(memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown100", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown0(memref<1x3x224x224xf32, "cuda">) -> memref<1x3x224x224xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 147 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown0", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown1(memref<64x3x7x7xf32, "cuda">) -> memref<64x3x7x7xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 10 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown1", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown3(memref<1x64x112x112xf16, "cuda">) -> memref<1x64x112x112xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown3", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown4(memref<64x64x3x3xf32, "cuda">) -> memref<64x64x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 36 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown4", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown6(memref<1x64x56x56xf16, "cuda">) -> memref<1x64x56x56xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown6", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown9(memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">) -> memref<1x64x56x56xf16, "cuda"> 
attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown9", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown16(memref<128x64x1x1xf32, "cuda">) -> memref<128x64x1x1xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 8 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown16", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown18(memref<128x64x3x3xf32, "cuda">) -> memref<128x64x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 72 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown18", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown20(memref<1x128x28x28xf16, "cuda">) -> memref<1x128x28x28xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 98 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown20", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown21(memref<128x128x3x3xf32, "cuda">) -> memref<128x128x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 144 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown21", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown23(memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">) -> memref<1x128x28x28xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 98 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown23", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown30(memref<256x128x1x1xf32, "cuda">) -> memref<256x128x1x1xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 32 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown30", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown32(memref<256x128x3x3xf32, "cuda">) -> memref<256x128x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown32", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown34(memref<1x256x14x14xf16, "cuda">) -> memref<1x256x14x14xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 49 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown34", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown35(memref<256x256x3x3xf32, "cuda">) -> memref<256x256x3x3xf16, "cuda"> attributes 
{__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 576 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown35", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown37(memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">) -> memref<1x256x14x14xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 49 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown37", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown44(memref<512x256x1x1xf32, "cuda">) -> memref<512x256x1x1xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 128 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown44", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown46(memref<512x256x3x3xf32, "cuda">) -> memref<512x256x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown46", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown48(memref<1x512x7x7xf16, "cuda">) -> memref<1x512x7x7xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 25 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown48", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown49(memref<512x512x3x3xf32, "cuda">) -> memref<512x512x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 2304 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown49", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown51(memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">) -> memref<1x512x7x7xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 25 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown51", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown59(memref<1x512xf16, "cuda">) -> memref<1x512xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown59", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown60(memref<1000x512xf32, "cuda">) -> memref<1000x512xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 500 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown60", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown61(memref<1000xf32, "cuda">, memref<1x1000xf16, "cuda">) -> memref<1x1000xf16, 
"cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown61", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown62(memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown62", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown72(memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown72", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown82(memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown82", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown92(memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32, 1 : i32], __byre__kernel_name = "Unknown92", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} func.func @main(%arg0: memref<64xf32, "cuda">, %arg1: memref<64xf32, "cuda">, %arg2: memref<64x3x7x7xf32, "cuda">, %arg3: memref<1000xf32, "cuda">, %arg4: memref<1000x512xf32, "cuda">, %arg5: memref<64xf32, "cuda">, %arg6: memref<64xf32, "cuda">, %arg7: memref<64xf32, "cuda">, %arg8: memref<64xf32, "cuda">, %arg9: memref<64x64x3x3xf32, "cuda">, %arg10: memref<64x64x3x3xf32, "cuda">, %arg11: memref<64xf32, "cuda">, %arg12: memref<64xf32, "cuda">, %arg13: memref<64xf32, "cuda">, %arg14: memref<64xf32, "cuda">, %arg15: memref<64x64x3x3xf32, "cuda">, %arg16: memref<64x64x3x3xf32, "cuda">, %arg17: memref<128xf32, "cuda">, %arg18: memref<128xf32, "cuda">, %arg19: memref<128xf32, "cuda">, %arg20: memref<128xf32, "cuda">, %arg21: memref<128x64x3x3xf32, "cuda">, %arg22: memref<128x128x3x3xf32, "cuda">, %arg23: memref<128x64x1x1xf32, "cuda">, %arg24: memref<128xf32, "cuda">, %arg25: memref<128xf32, "cuda">, %arg26: memref<128xf32, "cuda">, %arg27: memref<128xf32, "cuda">, %arg28: memref<128xf32, "cuda">, %arg29: memref<128xf32, "cuda">, %arg30: memref<128x128x3x3xf32, "cuda">, %arg31: memref<128x128x3x3xf32, "cuda">, %arg32: memref<256xf32, "cuda">, %arg33: memref<256xf32, "cuda">, %arg34: memref<256xf32, "cuda">, %arg35: memref<256xf32, "cuda">, %arg36: memref<256x128x3x3xf32, "cuda">, %arg37: memref<256x256x3x3xf32, "cuda">, %arg38: memref<256x128x1x1xf32, "cuda">, %arg39: memref<256xf32, "cuda">, %arg40: memref<256xf32, "cuda">, %arg41: memref<256xf32, "cuda">, %arg42: memref<256xf32, "cuda">, %arg43: memref<256xf32, "cuda">, %arg44: memref<256xf32, "cuda">, %arg45: 
memref<256x256x3x3xf32, "cuda">, %arg46: memref<256x256x3x3xf32, "cuda">, %arg47: memref<512xf32, "cuda">, %arg48: memref<512xf32, "cuda">, %arg49: memref<512xf32, "cuda">, %arg50: memref<512xf32, "cuda">, %arg51: memref<512x256x3x3xf32, "cuda">, %arg52: memref<512x512x3x3xf32, "cuda">, %arg53: memref<512x256x1x1xf32, "cuda">, %arg54: memref<512xf32, "cuda">, %arg55: memref<512xf32, "cuda">, %arg56: memref<512xf32, "cuda">, %arg57: memref<512xf32, "cuda">, %arg58: memref<512xf32, "cuda">, %arg59: memref<512xf32, "cuda">, %arg60: memref<512x512x3x3xf32, "cuda">, %arg61: memref<512x512x3x3xf32, "cuda">, %arg62: memref, %arg63: memref<64xf32, "cuda">, %arg64: memref<64xf32, "cuda">, %arg65: memref, %arg66: memref<64xf32, "cuda">, %arg67: memref<64xf32, "cuda">, %arg68: memref, %arg69: memref<64xf32, "cuda">, %arg70: memref<64xf32, "cuda">, %arg71: memref, %arg72: memref<64xf32, "cuda">, %arg73: memref<64xf32, "cuda">, %arg74: memref, %arg75: memref<64xf32, "cuda">, %arg76: memref<64xf32, "cuda">, %arg77: memref, %arg78: memref<128xf32, "cuda">, %arg79: memref<128xf32, "cuda">, %arg80: memref, %arg81: memref<128xf32, "cuda">, %arg82: memref<128xf32, "cuda">, %arg83: memref, %arg84: memref<128xf32, "cuda">, %arg85: memref<128xf32, "cuda">, %arg86: memref, %arg87: memref<128xf32, "cuda">, %arg88: memref<128xf32, "cuda">, %arg89: memref, %arg90: memref<128xf32, "cuda">, %arg91: memref<128xf32, "cuda">, %arg92: memref, %arg93: memref<256xf32, "cuda">, %arg94: memref<256xf32, "cuda">, %arg95: memref, %arg96: memref<256xf32, "cuda">, %arg97: memref<256xf32, "cuda">, %arg98: memref, %arg99: memref<256xf32, "cuda">, %arg100: memref<256xf32, "cuda">, %arg101: memref, %arg102: memref<256xf32, "cuda">, %arg103: memref<256xf32, "cuda">, %arg104: memref, %arg105: memref<256xf32, "cuda">, %arg106: memref<256xf32, "cuda">, %arg107: memref, %arg108: memref<512xf32, "cuda">, %arg109: memref<512xf32, "cuda">, %arg110: memref, %arg111: memref<512xf32, "cuda">, %arg112: memref<512xf32, "cuda">, %arg113: memref, %arg114: memref<512xf32, "cuda">, %arg115: memref<512xf32, "cuda">, %arg116: memref, %arg117: memref<512xf32, "cuda">, %arg118: memref<512xf32, "cuda">, %arg119: memref, %arg120: memref<512xf32, "cuda">, %arg121: memref<512xf32, "cuda">, %arg122: memref<1x3x224x224xf32, "cuda">) -> (memref<1x1000xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, 
memref<64xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<64x3x7x7xf16, "cuda">, memref<1x3x224x224xf16, "cuda">, memref<1x64x112x112xf16, "cuda">, memref<1x64x112x112xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<128x64x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128x64x1x1xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x128x1x1xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<512x256x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512xf16, "cuda">, memref<512x1000xf16, "cuda">) attributes {__placeholder__byre.entry_point} { %0 = call @Unknown0(%arg122) : (memref<1x3x224x224xf32, "cuda">) -> memref<1x3x224x224xf16, "cuda"> %1 = call @Unknown1(%arg2) : (memref<64x3x7x7xf32, "cuda">) -> memref<64x3x7x7xf16, "cuda"> @@ -2603,7 +793,7 @@ module attributes {gpu.container_module} { %alloc_7 = memref.alloc() : memref<64xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_4, %arg6, %arg5, %alloc_5, %alloc_6, %alloc_7) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> %4 = call @Unknown6(%alloc_5) : 
(memref<1x64x56x56xf16, "cuda">) -> memref<1x64x56x56xf16, "cuda"> - %5 = call @Unknown7(%arg10) : (memref<64x64x3x3xf32, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + %5 = call @Unknown4(%arg10) : (memref<64x64x3x3xf32, "cuda">) -> memref<64x64x3x3xf16, "cuda"> %alloc_8 = memref.alloc() : memref<1x64x56x56xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%4, %5, %alloc_8) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> %alloc_9 = memref.alloc() : memref<1x64x56x56xf16, "cuda"> @@ -2611,22 +801,22 @@ module attributes {gpu.container_module} { %alloc_11 = memref.alloc() : memref<64xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_8, %arg8, %arg7, %alloc_9, %alloc_10, %alloc_11) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> %6 = call @Unknown9(%alloc_9, %alloc_3) : (memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">) -> memref<1x64x56x56xf16, "cuda"> - %7 = call @Unknown10(%arg15) : (memref<64x64x3x3xf32, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + %7 = call @Unknown4(%arg15) : (memref<64x64x3x3xf32, "cuda">) -> memref<64x64x3x3xf16, "cuda"> %alloc_12 = memref.alloc() : memref<1x64x56x56xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%6, %7, %alloc_12) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> %alloc_13 = memref.alloc() : memref<1x64x56x56xf16, "cuda"> %alloc_14 = memref.alloc() : memref<64xf32, "cuda"> %alloc_15 = memref.alloc() : memref<64xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_12, %arg12, %arg11, %alloc_13, %alloc_14, %alloc_15) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - %8 = call @Unknown12(%alloc_13) : (memref<1x64x56x56xf16, "cuda">) -> memref<1x64x56x56xf16, "cuda"> - %9 = call @Unknown13(%arg16) : (memref<64x64x3x3xf32, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + %8 = call @Unknown6(%alloc_13) : (memref<1x64x56x56xf16, "cuda">) -> memref<1x64x56x56xf16, "cuda"> + %9 = call @Unknown4(%arg16) : (memref<64x64x3x3xf32, "cuda">) -> memref<64x64x3x3xf16, "cuda"> %alloc_16 = memref.alloc() : memref<1x64x56x56xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%8, %9, %alloc_16) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation 
= dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> %alloc_17 = memref.alloc() : memref<1x64x56x56xf16, "cuda"> %alloc_18 = memref.alloc() : memref<64xf32, "cuda"> %alloc_19 = memref.alloc() : memref<64xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_16, %arg14, %arg13, %alloc_17, %alloc_18, %alloc_19) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - %10 = call @Unknown15(%alloc_17, %6) : (memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">) -> memref<1x64x56x56xf16, "cuda"> + %10 = call @Unknown9(%alloc_17, %6) : (memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">) -> memref<1x64x56x56xf16, "cuda"> %11 = call @Unknown16(%arg23) : (memref<128x64x1x1xf32, "cuda">) -> memref<128x64x1x1xf16, "cuda"> %alloc_20 = memref.alloc() : memref<1x128x28x28xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%10, %11, %alloc_20) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<128x64x1x1xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> @@ -2650,22 +840,22 @@ module attributes {gpu.container_module} { %alloc_31 = memref.alloc() : memref<128xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_28, %arg20, %arg19, %alloc_29, %alloc_30, %alloc_31) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> %15 = call @Unknown23(%alloc_29, %alloc_21) : (memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">) -> memref<1x128x28x28xf16, "cuda"> - %16 = call @Unknown24(%arg30) : (memref<128x128x3x3xf32, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + %16 = call @Unknown21(%arg30) : (memref<128x128x3x3xf32, "cuda">) -> memref<128x128x3x3xf16, "cuda"> %alloc_32 = memref.alloc() : memref<1x128x28x28xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%15, %16, %alloc_32) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> %alloc_33 = memref.alloc() : memref<1x128x28x28xf16, "cuda"> %alloc_34 = memref.alloc() : memref<128xf32, "cuda"> %alloc_35 = memref.alloc() : memref<128xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_32, 
%arg27, %arg26, %alloc_33, %alloc_34, %alloc_35) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - %17 = call @Unknown26(%alloc_33) : (memref<1x128x28x28xf16, "cuda">) -> memref<1x128x28x28xf16, "cuda"> - %18 = call @Unknown27(%arg31) : (memref<128x128x3x3xf32, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + %17 = call @Unknown20(%alloc_33) : (memref<1x128x28x28xf16, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + %18 = call @Unknown21(%arg31) : (memref<128x128x3x3xf32, "cuda">) -> memref<128x128x3x3xf16, "cuda"> %alloc_36 = memref.alloc() : memref<1x128x28x28xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%17, %18, %alloc_36) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> %alloc_37 = memref.alloc() : memref<1x128x28x28xf16, "cuda"> %alloc_38 = memref.alloc() : memref<128xf32, "cuda"> %alloc_39 = memref.alloc() : memref<128xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_36, %arg29, %arg28, %alloc_37, %alloc_38, %alloc_39) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - %19 = call @Unknown29(%alloc_37, %15) : (memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + %19 = call @Unknown23(%alloc_37, %15) : (memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">) -> memref<1x128x28x28xf16, "cuda"> %20 = call @Unknown30(%arg38) : (memref<256x128x1x1xf32, "cuda">) -> memref<256x128x1x1xf16, "cuda"> %alloc_40 = memref.alloc() : memref<1x256x14x14xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%19, %20, %alloc_40) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<256x128x1x1xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> @@ -2689,22 +879,22 @@ module attributes {gpu.container_module} { %alloc_51 = memref.alloc() : memref<256xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_48, %arg35, %arg34, %alloc_49, %alloc_50, %alloc_51) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> %24 = call @Unknown37(%alloc_49, %alloc_41) : (memref<1x256x14x14xf16, "cuda">, 
memref<1x256x14x14xf16, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - %25 = call @Unknown38(%arg45) : (memref<256x256x3x3xf32, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + %25 = call @Unknown35(%arg45) : (memref<256x256x3x3xf32, "cuda">) -> memref<256x256x3x3xf16, "cuda"> %alloc_52 = memref.alloc() : memref<1x256x14x14xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%24, %25, %alloc_52) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> %alloc_53 = memref.alloc() : memref<1x256x14x14xf16, "cuda"> %alloc_54 = memref.alloc() : memref<256xf32, "cuda"> %alloc_55 = memref.alloc() : memref<256xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_52, %arg42, %arg41, %alloc_53, %alloc_54, %alloc_55) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - %26 = call @Unknown40(%alloc_53) : (memref<1x256x14x14xf16, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - %27 = call @Unknown41(%arg46) : (memref<256x256x3x3xf32, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + %26 = call @Unknown34(%alloc_53) : (memref<1x256x14x14xf16, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + %27 = call @Unknown35(%arg46) : (memref<256x256x3x3xf32, "cuda">) -> memref<256x256x3x3xf16, "cuda"> %alloc_56 = memref.alloc() : memref<1x256x14x14xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%26, %27, %alloc_56) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> %alloc_57 = memref.alloc() : memref<1x256x14x14xf16, "cuda"> %alloc_58 = memref.alloc() : memref<256xf32, "cuda"> %alloc_59 = memref.alloc() : memref<256xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_56, %arg44, %arg43, %alloc_57, %alloc_58, %alloc_59) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - %28 = call @Unknown43(%alloc_57, %24) : (memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + %28 = call @Unknown37(%alloc_57, %24) : (memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">) -> memref<1x256x14x14xf16, "cuda"> %29 = call @Unknown44(%arg53) : (memref<512x256x1x1xf32, "cuda">) -> memref<512x256x1x1xf16, "cuda"> %alloc_60 = memref.alloc() : memref<1x512x7x7xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%28, %29, %alloc_60) {batch_group_count = 1 : 
i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> @@ -2728,71 +918,73 @@ module attributes {gpu.container_module} { %alloc_71 = memref.alloc() : memref<512xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_68, %arg50, %arg49, %alloc_69, %alloc_70, %alloc_71) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> %33 = call @Unknown51(%alloc_69, %alloc_61) : (memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">) -> memref<1x512x7x7xf16, "cuda"> - %34 = call @Unknown52(%arg60) : (memref<512x512x3x3xf32, "cuda">) -> memref<512x512x3x3xf16, "cuda"> + %34 = call @Unknown49(%arg60) : (memref<512x512x3x3xf32, "cuda">) -> memref<512x512x3x3xf16, "cuda"> %alloc_72 = memref.alloc() : memref<1x512x7x7xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%33, %34, %alloc_72) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> %alloc_73 = memref.alloc() : memref<1x512x7x7xf16, "cuda"> %alloc_74 = memref.alloc() : memref<512xf32, "cuda"> %alloc_75 = memref.alloc() : memref<512xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_72, %arg57, %arg56, %alloc_73, %alloc_74, %alloc_75) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - %35 = call @Unknown54(%alloc_73) : (memref<1x512x7x7xf16, "cuda">) -> memref<1x512x7x7xf16, "cuda"> - %36 = call @Unknown55(%arg61) : (memref<512x512x3x3xf32, "cuda">) -> memref<512x512x3x3xf16, "cuda"> + %35 = call @Unknown48(%alloc_73) : (memref<1x512x7x7xf16, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + %36 = call @Unknown49(%arg61) : (memref<512x512x3x3xf32, "cuda">) -> memref<512x512x3x3xf16, "cuda"> %alloc_76 = memref.alloc() : memref<1x512x7x7xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%35, %36, %alloc_76) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> %alloc_77 = memref.alloc() : memref<1x512x7x7xf16, "cuda"> %alloc_78 = memref.alloc() : 
memref<512xf32, "cuda"> %alloc_79 = memref.alloc() : memref<512xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%alloc_76, %arg59, %arg58, %alloc_77, %alloc_78, %alloc_79) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - %37 = call @Unknown57(%alloc_77, %33) : (memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">) -> memref<1x512x7x7xf16, "cuda"> - %alloc_80 = memref.alloc() : memref<1x512xf16, "cuda"> - byre.compute @ReduceSumOp_f16_f16(%37, %alloc_80) {device = "cuda", dimensions = dense<[3, 2]> : tensor<2xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512xf16, "cuda"> - %38 = call @Unknown58(%alloc_80) : (memref<1x512xf16, "cuda">) -> memref<1x512xf16, "cuda"> - %39 = call @Unknown59(%arg4) : (memref<1000x512xf32, "cuda">) -> memref<1000x512xf16, "cuda"> + %37 = call @Unknown51(%alloc_77, %33) : (memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + %collapse_shape = memref.collapse_shape %37 [[0, 1], [2, 3]] : memref<1x512x7x7xf16, "cuda"> into memref<512x49xf16, "cuda"> + %alloc_80 = memref.alloc() : memref<512xf16, "cuda"> + byre.compute @PTXOp(%collapse_shape, %alloc_80) {BlockSize.x = 64 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 512 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown58_kernel"} : memref<512x49xf16, "cuda">, memref<512xf16, "cuda"> + %expand_shape = memref.expand_shape %alloc_80 [[0, 1]] : memref<512xf16, "cuda"> into memref<1x512xf16, "cuda"> + %38 = call @Unknown59(%expand_shape) : (memref<1x512xf16, "cuda">) -> memref<1x512xf16, "cuda"> + %39 = call @Unknown60(%arg4) : (memref<1000x512xf32, "cuda">) -> memref<1000x512xf16, "cuda"> %alloc_81 = memref.alloc() : memref<512x1000xf16, "cuda"> byre.compute @TransposeOp_f16_f16(%39, %alloc_81) {device = "cuda", memory_effects = [1 : i32, 2 : i32], minor_to_major = dense<[0, 1]> : tensor<2xindex>, permutation = dense<[1, 0]> : tensor<2xi64>} : memref<1000x512xf16, "cuda">, memref<512x1000xf16, "cuda"> %alloc_82 = memref.alloc() : memref<1x1000xf16, "cuda"> byre.compute @MatmulOp_f16f16_f16(%38, %39, %alloc_82) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<1x512xf16, "cuda">, memref<1000x512xf16, "cuda">, memref<1x1000xf16, "cuda"> - %40 = call @Unknown60(%arg3, %alloc_82) : (memref<1000xf32, "cuda">, memref<1x1000xf16, "cuda">) -> memref<1x1000xf16, "cuda"> - %41 = call @Unknown61(%alloc_1, %arg63) : (memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> + %40 = call @Unknown61(%arg3, %alloc_82) : (memref<1000xf32, "cuda">, memref<1x1000xf16, "cuda">) -> memref<1x1000xf16, "cuda"> + %41 = call @Unknown62(%alloc_1, %arg63) : (memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> %42 = call @Unknown62(%alloc_2, %arg64) : (memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> - %43 = call @Unknown63(%alloc_6, %arg66) : (memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> - %44 = call @Unknown64(%alloc_7, %arg67) : (memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> - %45 = call 
@Unknown65(%alloc_10, %arg69) : (memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> - %46 = call @Unknown66(%alloc_11, %arg70) : (memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> - %47 = call @Unknown67(%alloc_14, %arg72) : (memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> - %48 = call @Unknown68(%alloc_15, %arg73) : (memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> - %49 = call @Unknown69(%alloc_18, %arg75) : (memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> - %50 = call @Unknown70(%alloc_19, %arg76) : (memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> - %51 = call @Unknown71(%alloc_26, %arg78) : (memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> + %43 = call @Unknown62(%alloc_6, %arg66) : (memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> + %44 = call @Unknown62(%alloc_7, %arg67) : (memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> + %45 = call @Unknown62(%alloc_10, %arg69) : (memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> + %46 = call @Unknown62(%alloc_11, %arg70) : (memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> + %47 = call @Unknown62(%alloc_14, %arg72) : (memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> + %48 = call @Unknown62(%alloc_15, %arg73) : (memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> + %49 = call @Unknown62(%alloc_18, %arg75) : (memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> + %50 = call @Unknown62(%alloc_19, %arg76) : (memref<64xf32, "cuda">, memref<64xf32, "cuda">) -> memref<64xf32, "cuda"> + %51 = call @Unknown72(%alloc_26, %arg78) : (memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> %52 = call @Unknown72(%alloc_27, %arg79) : (memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> - %53 = call @Unknown73(%alloc_30, %arg81) : (memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> - %54 = call @Unknown74(%alloc_31, %arg82) : (memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> - %55 = call @Unknown75(%alloc_22, %arg84) : (memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> - %56 = call @Unknown76(%alloc_23, %arg85) : (memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> - %57 = call @Unknown77(%alloc_34, %arg87) : (memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> - %58 = call @Unknown78(%alloc_35, %arg88) : (memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> - %59 = call @Unknown79(%alloc_38, %arg90) : (memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> - %60 = call @Unknown80(%alloc_39, %arg91) : (memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> - %61 = call @Unknown81(%alloc_46, %arg93) : (memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> + %53 = call @Unknown72(%alloc_30, %arg81) : (memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> + %54 = call @Unknown72(%alloc_31, %arg82) : (memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> + %55 = call @Unknown72(%alloc_22, %arg84) : (memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> + %56 = call @Unknown72(%alloc_23, %arg85) : 
(memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> + %57 = call @Unknown72(%alloc_34, %arg87) : (memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> + %58 = call @Unknown72(%alloc_35, %arg88) : (memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> + %59 = call @Unknown72(%alloc_38, %arg90) : (memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> + %60 = call @Unknown72(%alloc_39, %arg91) : (memref<128xf32, "cuda">, memref<128xf32, "cuda">) -> memref<128xf32, "cuda"> + %61 = call @Unknown82(%alloc_46, %arg93) : (memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> %62 = call @Unknown82(%alloc_47, %arg94) : (memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> - %63 = call @Unknown83(%alloc_50, %arg96) : (memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> - %64 = call @Unknown84(%alloc_51, %arg97) : (memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> - %65 = call @Unknown85(%alloc_42, %arg99) : (memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> - %66 = call @Unknown86(%alloc_43, %arg100) : (memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> - %67 = call @Unknown87(%alloc_54, %arg102) : (memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> - %68 = call @Unknown88(%alloc_55, %arg103) : (memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> - %69 = call @Unknown89(%alloc_58, %arg105) : (memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> - %70 = call @Unknown90(%alloc_59, %arg106) : (memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> - %71 = call @Unknown91(%alloc_66, %arg108) : (memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> + %63 = call @Unknown82(%alloc_50, %arg96) : (memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> + %64 = call @Unknown82(%alloc_51, %arg97) : (memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> + %65 = call @Unknown82(%alloc_42, %arg99) : (memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> + %66 = call @Unknown82(%alloc_43, %arg100) : (memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> + %67 = call @Unknown82(%alloc_54, %arg102) : (memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> + %68 = call @Unknown82(%alloc_55, %arg103) : (memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> + %69 = call @Unknown82(%alloc_58, %arg105) : (memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> + %70 = call @Unknown82(%alloc_59, %arg106) : (memref<256xf32, "cuda">, memref<256xf32, "cuda">) -> memref<256xf32, "cuda"> + %71 = call @Unknown92(%alloc_66, %arg108) : (memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> %72 = call @Unknown92(%alloc_67, %arg109) : (memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> - %73 = call @Unknown93(%alloc_70, %arg111) : (memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> - %74 = call @Unknown94(%alloc_71, %arg112) : (memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> - %75 = call @Unknown95(%alloc_62, %arg114) : (memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> - %76 = call 
@Unknown96(%alloc_63, %arg115) : (memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> - %77 = call @Unknown97(%alloc_74, %arg117) : (memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> - %78 = call @Unknown98(%alloc_75, %arg118) : (memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> - %79 = call @Unknown99(%alloc_78, %arg120) : (memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> - %80 = call @Unknown100(%alloc_79, %arg121) : (memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> + %73 = call @Unknown92(%alloc_70, %arg111) : (memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> + %74 = call @Unknown92(%alloc_71, %arg112) : (memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> + %75 = call @Unknown92(%alloc_62, %arg114) : (memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> + %76 = call @Unknown92(%alloc_63, %arg115) : (memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> + %77 = call @Unknown92(%alloc_74, %arg117) : (memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> + %78 = call @Unknown92(%alloc_75, %arg118) : (memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> + %79 = call @Unknown92(%alloc_78, %arg120) : (memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> + %80 = call @Unknown92(%alloc_79, %arg121) : (memref<512xf32, "cuda">, memref<512xf32, "cuda">) -> memref<512xf32, "cuda"> return %40, %arg0, %arg1, %arg5, %arg6, %arg7, %arg8, %arg11, %arg12, %arg13, %arg14, %arg17, %arg18, %arg19, %arg20, %arg24, %arg25, %arg26, %arg27, %arg28, %arg29, %arg32, %arg33, %arg34, %arg35, %arg39, %arg40, %arg41, %arg42, %arg43, %arg44, %arg47, %arg48, %arg49, %arg50, %arg54, %arg55, %arg56, %arg57, %arg58, %arg59, %41, %42, %43, %44, %45, %46, %47, %48, %49, %50, %51, %52, %53, %54, %55, %56, %57, %58, %59, %60, %61, %62, %63, %64, %65, %66, %67, %68, %69, %70, %71, %72, %73, %74, %75, %76, %77, %78, %79, %80, %1, %0, %alloc, %2, %alloc_3, %3, %alloc_4, %4, %5, %alloc_8, %6, %7, %alloc_12, %8, %9, %alloc_16, %10, %12, %alloc_24, %13, %14, %alloc_28, %11, %alloc_20, %15, %16, %alloc_32, %17, %18, %alloc_36, %19, %21, %alloc_44, %22, %23, %alloc_48, %20, %alloc_40, %24, %25, %alloc_52, %26, %27, %alloc_56, %28, %30, %alloc_64, %31, %32, %alloc_68, %29, %alloc_60, %33, %34, %alloc_72, %35, %36, %alloc_76, %37, %38, %alloc_81 : memref<1x1000xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, 
memref<512xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<64x3x7x7xf16, "cuda">, memref<1x3x224x224xf16, "cuda">, memref<1x64x112x112xf16, "cuda">, memref<1x64x112x112xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<128x64x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128x64x1x1xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x128x1x1xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<512x256x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512xf16, "cuda">, memref<512x1000xf16, "cuda"> } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/FW/9a_byre_host.mlir b/compiler/test/E2E/ResNet18/FW/9a_byre_host.mlir index 0e7edc2f2..9928b1165 100644 --- a/compiler/test/E2E/ResNet18/FW/9a_byre_host.mlir +++ b/compiler/test/E2E/ResNet18/FW/9a_byre_host.mlir @@ -4,2682 +4,927 @@ module attributes {byre.container_module, gpu.container_module} { gpu.module @unified { - gpu.func @Unknown100(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 
1.000000e-01 : f32 - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> - } - gpu.return - } - gpu.func @Unknown99(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> - } - gpu.return - } - gpu.func @Unknown98(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> - } - gpu.return - } - gpu.func @Unknown97(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> - } - gpu.return - } - gpu.func @Unknown96(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> - } - gpu.return - } - gpu.func @Unknown95(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - 
%4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> - } - gpu.return - } - gpu.func @Unknown94(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> - } - gpu.return - } - gpu.func @Unknown93(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> - } - gpu.return - } - gpu.func @Unknown92(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> - } - gpu.return - } - gpu.func @Unknown91(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<512xf32> - %7 = memref.load %arg1[%4] : memref<512xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<512xf32> - } - gpu.return - } - gpu.func @Unknown90(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load 
%arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown89(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown88(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown87(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown86(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown85(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : 
memref<256xf32> - } - gpu.return - } - gpu.func @Unknown84(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown83(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown82(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown81(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown80(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown79(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = 
arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown78(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown77(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown76(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown75(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown74(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x 
- %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown73(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown72(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown71(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown70(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown69(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : 
memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown68(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown67(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown66(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown65(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown64(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : 
memref<64xf32> - } - gpu.return - } - gpu.func @Unknown63(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown62(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown61(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index + gpu.func @Unknown92(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { + %cst = arith.constant 1.000000e-01 : f32 + %cst_0 = arith.constant 0.899999976 : f32 + %c512 = arith.constant 512 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c512 step %6 { + %7 = memref.load %arg1[%arg3] : memref<512xf32> + %8 = memref.load %arg0[%arg3] : memref<512xf32> + %9 = arith.mulf %7, %cst_0 : f32 + %10 = arith.mulf %8, %cst : f32 + %11 = arith.addf %10, %9 : f32 + memref.store %11, %arg2[%arg3] : memref<512xf32> } gpu.return } - gpu.func @Unknown60(%arg0: memref<1000xf32>, %arg1: memref<1x1000xf16>, %arg2: memref<1x1000xf16>) kernel { - %c0 = arith.constant 0 : index - %c1000 = arith.constant 1000 : index + gpu.func @Unknown82(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { + %cst = arith.constant 1.000000e-01 : f32 + %cst_0 = arith.constant 0.899999976 : f32 + %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1000 : index - scf.if %5 { - %6 = memref.load %arg1[%c0, %4] : memref<1x1000xf16> - %7 = memref.load %arg0[%4] : memref<1000xf32> - %8 = arith.truncf %7 : f32 to f16 - %9 = arith.addf %6, %8 : f16 - memref.store %9, %arg2[%c0, %4] : memref<1x1000xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c256 step %6 { + %7 = memref.load %arg1[%arg3] : 
memref<256xf32> + %8 = memref.load %arg0[%arg3] : memref<256xf32> + %9 = arith.mulf %7, %cst_0 : f32 + %10 = arith.mulf %8, %cst : f32 + %11 = arith.addf %10, %9 : f32 + memref.store %11, %arg2[%arg3] : memref<256xf32> } gpu.return } - gpu.func @Unknown59(%arg0: memref<1000x512xf32>, %arg1: memref<1000x512xf16>) kernel { - %c0 = arith.constant 0 : index - %c512000 = arith.constant 512000 : index - %c512 = arith.constant 512 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown72(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { + %cst = arith.constant 1.000000e-01 : f32 + %cst_0 = arith.constant 0.899999976 : f32 + %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512000 : index - scf.if %5 { - %6 = arith.remsi %4, %c512 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c512 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c512 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9] : memref<1000x512xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9] : memref<1000x512xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c128 step %6 { + %7 = memref.load %arg1[%arg3] : memref<128xf32> + %8 = memref.load %arg0[%arg3] : memref<128xf32> + %9 = arith.mulf %7, %cst_0 : f32 + %10 = arith.mulf %8, %cst : f32 + %11 = arith.addf %10, %9 : f32 + memref.store %11, %arg2[%arg3] : memref<128xf32> } gpu.return } - gpu.func @Unknown58(%arg0: memref<1x512xf16>, %arg1: memref<1x512xf16>) kernel { - %cst = arith.constant 2.040100e-02 : f16 - %c0 = arith.constant 0 : index - %c512 = arith.constant 512 : index + gpu.func @Unknown62(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { + %cst = arith.constant 1.000000e-01 : f32 + %cst_0 = arith.constant 0.899999976 : f32 + %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%c0, %4] : memref<1x512xf16> - %7 = arith.mulf %6, %cst : f16 - memref.store %7, %arg1[%c0, %4] : memref<1x512xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c64 step %6 { + %7 = memref.load %arg1[%arg3] : memref<64xf32> + %8 = memref.load %arg0[%arg3] : memref<64xf32> + %9 = arith.mulf %7, %cst_0 : f32 + %10 = arith.mulf %8, %cst : f32 + %11 = arith.addf %10, %9 : f32 + memref.store %11, %arg2[%arg3] : memref<64xf32> } gpu.return } - gpu.func @Unknown57(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 + gpu.func @Unknown61(%arg0: memref<1000xf32>, %arg1: memref<1x1000xf16>, %arg2: memref<1x1000xf16>) kernel { %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index + %c1000 = arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - 
%7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c1000 step %6 { + %7 = memref.load %arg0[%arg3] : memref<1000xf32> + %8 = memref.load %arg1[%c0, %arg3] : memref<1x1000xf16> + %9 = arith.truncf %7 : f32 to f16 + %10 = arith.addf %8, %9 : f16 + memref.store %10, %arg2[%c0, %arg3] : memref<1x1000xf16> } gpu.return } - gpu.func @Unknown55(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown60(%arg0: memref<1000x512xf32>, %arg1: memref<1000x512xf16>) kernel { + %c512000 = arith.constant 512000 : index %c512 = arith.constant 512 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf16> - } - gpu.return - } - gpu.func @Unknown54(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index - %c7 = 
arith.constant 7 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c512000 step %6 { + %7 = arith.remsi %arg2, %c512 : index + %8 = arith.divsi %arg2, %c512 : index + %9 = memref.load %arg0[%8, %7] : memref<1000x512xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7] : memref<1000x512xf16> } gpu.return } - gpu.func @Unknown52(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { + gpu.func @Unknown59(%arg0: memref<1x512xf16>, %arg1: memref<1x512xf16>) kernel { + %cst = arith.constant 2.040100e-02 : f16 %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c512 = arith.constant 512 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for 
%arg2 = %4 to %c512 step %6 { + %7 = memref.load %arg0[%c0, %arg2] : memref<1x512xf16> + %8 = arith.mulf %7, %cst : f16 + memref.store %8, %arg1[%c0, %arg2] : memref<1x512xf16> } gpu.return } gpu.func @Unknown51(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) kernel { + %c25088 = arith.constant 25088 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c25088 step %6 { + %7 = arith.remsi %arg3, %c7 : index + %8 = arith.divsi %arg3, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %13 = arith.addf %11, %12 : f16 + %14 = arith.maximumf %13, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x512x7x7xf16> } gpu.return } gpu.func @Unknown49(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c512 = arith.constant 512 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, 
%24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c2359296 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x512x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x512x3x3xf16> } gpu.return } gpu.func @Unknown48(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) kernel { + %c25088 = arith.constant 25088 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c25088 step %6 { + %7 = arith.remsi %arg2, %c7 : index + %8 = arith.divsi %arg2, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %12 = arith.maximumf %11, %cst : f16 + memref.store %12, %arg1[%c0, %10, %9, %7] : memref<1x512x7x7xf16> } gpu.return } gpu.func @Unknown46(%arg0: memref<512x256x3x3xf32>, %arg1: memref<512x256x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c1179648 = arith.constant 1179648 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1179648 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index 
- %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x256x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1179648 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x256x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x256x3x3xf16> } gpu.return } gpu.func @Unknown44(%arg0: memref<512x256x1x1xf32>, %arg1: memref<512x256x1x1xf16>) kernel { - %c0 = arith.constant 0 : index %c131072 = arith.constant 131072 : index - %c256 = arith.constant 256 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c131072 : index - scf.if %5 { - %6 = arith.remsi %4, %c256 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c256 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c256 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<512x256x1x1xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<512x256x1x1xf16> - } - gpu.return - } - gpu.func @Unknown43(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - 
%12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - } - gpu.return - } - gpu.func @Unknown41(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf16> - } - gpu.return - } - gpu.func @Unknown40(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, 
%13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - } - gpu.return - } - gpu.func @Unknown38(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c131072 step %6 { + %7 = arith.remsi %arg2, %c256 : index + %8 = arith.divsi %arg2, %c256 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<512x256x1x1xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<512x256x1x1xf16> } gpu.return } gpu.func @Unknown37(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>) kernel { + %c50176 = arith.constant 50176 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = 
arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c50176 step %6 { + %7 = arith.remsi %arg3, %c14 : index + %8 = arith.divsi %arg3, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %13 = arith.addf %11, %12 : f16 + %14 = arith.maximumf %13, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x256x14x14xf16> } gpu.return } gpu.func @Unknown35(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c589824 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = 
memref.load %arg0[%12, %11, %9, %7] : memref<256x256x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x256x3x3xf16> } gpu.return } gpu.func @Unknown34(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) kernel { + %c50176 = arith.constant 50176 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c50176 step %6 { + %7 = arith.remsi %arg2, %c14 : index + %8 = arith.divsi %arg2, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %12 = arith.maximumf %11, %cst : f16 + memref.store %12, %arg1[%c0, %10, %9, %7] : memref<1x256x14x14xf16> } gpu.return } gpu.func @Unknown32(%arg0: memref<256x128x3x3xf32>, %arg1: memref<256x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c294912 = arith.constant 294912 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c294912 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - 
%31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x128x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c294912 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x128x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x128x3x3xf16> } gpu.return } gpu.func @Unknown30(%arg0: memref<256x128x1x1xf32>, %arg1: memref<256x128x1x1xf16>) kernel { - %c0 = arith.constant 0 : index %c32768 = arith.constant 32768 : index - %c128 = arith.constant 128 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c32768 : index - scf.if %5 { - %6 = arith.remsi %4, %c128 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c128 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c128 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<256x128x1x1xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<256x128x1x1xf16> - } - gpu.return - } - gpu.func @Unknown29(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - } - gpu.return - } - gpu.func @Unknown27(%arg0: 
memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf16> - } - gpu.return - } - gpu.func @Unknown26(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - } - gpu.return - } - gpu.func @Unknown24(%arg0: memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index %0 = 
gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c32768 step %6 { + %7 = arith.remsi %arg2, %c128 : index + %8 = arith.divsi %arg2, %c128 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<256x128x1x1xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<256x128x1x1xf16> } gpu.return } gpu.func @Unknown23(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) kernel { + %c100352 = arith.constant 100352 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : 
index + scf.for %arg3 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg3, %c28 : index + %8 = arith.divsi %arg3, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %13 = arith.addf %11, %12 : f16 + %14 = arith.maximumf %13, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x128x28x28xf16> } gpu.return } gpu.func @Unknown21(%arg0: memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c147456 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x128x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x128x3x3xf16> } gpu.return } gpu.func @Unknown20(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) kernel { + %c100352 = arith.constant 100352 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = 
arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg2, %c28 : index + %8 = arith.divsi %arg2, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %12 = arith.maximumf %11, %cst : f16 + memref.store %12, %arg1[%c0, %10, %9, %7] : memref<1x128x28x28xf16> } gpu.return } gpu.func @Unknown18(%arg0: memref<128x64x3x3xf32>, %arg1: memref<128x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c73728 = arith.constant 73728 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c73728 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x64x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c73728 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x64x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x64x3x3xf16> } gpu.return } 
gpu.func @Unknown16(%arg0: memref<128x64x1x1xf32>, %arg1: memref<128x64x1x1xf16>) kernel { - %c0 = arith.constant 0 : index %c8192 = arith.constant 8192 : index - %c64 = arith.constant 64 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c8192 : index - scf.if %5 { - %6 = arith.remsi %4, %c64 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c64 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c64 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<128x64x1x1xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<128x64x1x1xf16> - } - gpu.return - } - gpu.func @Unknown15(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - } - gpu.return - } - gpu.func @Unknown13(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - 
%19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> - } - gpu.return - } - gpu.func @Unknown12(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - } - gpu.return - } - gpu.func @Unknown10(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - 
%26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c8192 step %6 { + %7 = arith.remsi %arg2, %c64 : index + %8 = arith.divsi %arg2, %c64 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<128x64x1x1xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<128x64x1x1xf16> } gpu.return } gpu.func @Unknown9(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) kernel { + %c200704 = arith.constant 200704 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - } - gpu.return - } - gpu.func @Unknown7(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = 
arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg3, %c56 : index + %8 = arith.divsi %arg3, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %13 = arith.addf %11, %12 : f16 + %14 = arith.maximumf %13, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x64x56x56xf16> } gpu.return } gpu.func @Unknown6(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) kernel { + %c200704 = arith.constant 200704 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg2, %c56 : index + %8 = arith.divsi %arg2, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %12 = arith.maximumf %11, %cst : f16 + memref.store %12, %arg1[%c0, %10, %9, %7] : memref<1x64x56x56xf16> } gpu.return } gpu.func @Unknown4(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index + %c3 = arith.constant 3 : index %0 = 
gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c36864 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x64x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x64x3x3xf16> } gpu.return } gpu.func @Unknown3(%arg0: memref<1x64x112x112xf16>, %arg1: memref<1x64x112x112xf16>) kernel { + %c802816 = arith.constant 802816 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index %c112 = arith.constant 112 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c112 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c112 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c112 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c112 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c112 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c112 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x112x112xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x64x112x112xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, 
%5 : index + scf.for %arg2 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg2, %c112 : index + %8 = arith.divsi %arg2, %c112 : index + %9 = arith.remsi %8, %c112 : index + %10 = arith.divsi %8, %c112 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x112x112xf16> + %12 = arith.maximumf %11, %cst : f16 + memref.store %12, %arg1[%c0, %10, %9, %7] : memref<1x64x112x112xf16> } gpu.return } gpu.func @Unknown1(%arg0: memref<64x3x7x7xf32>, %arg1: memref<64x3x7x7xf16>) kernel { - %c0 = arith.constant 0 : index %c9408 = arith.constant 9408 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %c3 = arith.constant 3 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c9408 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c3 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c3 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c3 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x3x7x7xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x3x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c9408 step %6 { + %7 = arith.remsi %arg2, %c7 : index + %8 = arith.divsi %arg2, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c3 : index + %12 = arith.divsi %10, %c3 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x3x7x7xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x3x7x7xf16> } gpu.return } gpu.func @Unknown0(%arg0: memref<1x3x224x224xf32>, %arg1: memref<1x3x224x224xf16>) kernel { - %c0 = arith.constant 0 : index %c150528 = arith.constant 150528 : index + %c0 = arith.constant 0 : index %c224 = arith.constant 224 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c150528 : index - scf.if %5 { - %6 = arith.remsi %4, %c224 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c224 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c224 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 
: index - %16 = arith.remsi %15, %c224 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c224 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c224 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x3x224x224xf32> - %27 = arith.truncf %26 : f32 to f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x3x224x224xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c150528 step %6 { + %7 = arith.remsi %arg2, %c224 : index + %8 = arith.divsi %arg2, %c224 : index + %9 = arith.remsi %8, %c224 : index + %10 = arith.divsi %8, %c224 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x3x224x224xf32> + %12 = arith.truncf %11 : f32 to f16 + memref.store %12, %arg1[%c0, %10, %9, %7] : memref<1x3x224x224xf16> } gpu.return } + gpu.func @Unknown58_kernel(%arg0: memref<512x49xf16>, %arg1: memref<512xf16>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c2 = arith.constant 2 : index + %c32 = arith.constant 32 : index + %cst = arith.constant 0.000000e+00 : f16 + %c1 = arith.constant 1 : index + %c49 = arith.constant 49 : index + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %0 = gpu.block_id x + %subview = memref.subview %arg0[%0, 0] [1, 49] [1, 1] : memref<512x49xf16> to memref<49xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<49xf16, strided<[1], offset: ?>> into memref<1x49xf16, strided<[49, 1], offset: ?>> + %alloca = memref.alloca() : memref<64xf16, #gpu.address_space> + %1 = gpu.thread_id x + %2 = arith.remsi %1, %c64 : index + %3 = arith.cmpi slt, %2, %c0 : index + %4 = arith.addi %2, %c64 : index + %5 = arith.select %3, %4, %2 : index + %6 = arith.cmpi slt, %5, %c49 : index + %7 = arith.select %6, %5, %c49 : index + %8 = arith.addi %5, %c1 : index + %9 = arith.cmpi slt, %8, %c49 : index + %10 = arith.select %9, %8, %c49 : index + %11 = arith.subi %10, %7 : index + %subview_0 = memref.subview %expand_shape[0, %7] [1, %11] [1, 1] : memref<1x49xf16, strided<[49, 1], offset: ?>> to memref> + %expand_shape_1 = memref.expand_shape %subview_0 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %12 = arith.cmpi ugt, %11, %c0 : index + %13 = scf.if %12 -> (f16) { + %21 = memref.load %expand_shape_1[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %21 : f16 + } else { + scf.yield %cst : f16 + } + %14 = arith.addf %13, %cst : f16 + memref.store %14, %alloca[%1] : memref<64xf16, #gpu.address_space> + gpu.barrier + %alloca_2 = memref.alloca() : memref<32xf16, #gpu.address_space> + %15 = arith.cmpi ult, %1, %c32 : index + scf.if %15 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca[%21] : memref<64xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca[%24] : memref<64xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_2[%1] : memref<32xf16, #gpu.address_space> + } + gpu.barrier + %alloca_3 = memref.alloca() : memref<16xf16, #gpu.address_space> + %16 = arith.cmpi ult, %1, %c16 : index + scf.if %16 { + %21 = arith.muli %1, %c2 : index + %22 = 
memref.load %alloca_2[%21] : memref<32xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_2[%24] : memref<32xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_3[%1] : memref<16xf16, #gpu.address_space> + } + gpu.barrier + %alloca_4 = memref.alloca() : memref<8xf16, #gpu.address_space> + %17 = arith.cmpi ult, %1, %c8 : index + scf.if %17 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_3[%21] : memref<16xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_3[%24] : memref<16xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_4[%1] : memref<8xf16, #gpu.address_space> + } + gpu.barrier + %alloca_5 = memref.alloca() : memref<4xf16, #gpu.address_space> + %18 = arith.cmpi ult, %1, %c4 : index + scf.if %18 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_4[%21] : memref<8xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_4[%24] : memref<8xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_5[%1] : memref<4xf16, #gpu.address_space> + } + gpu.barrier + %alloca_6 = memref.alloca() : memref<2xf16, #gpu.address_space> + %19 = arith.cmpi ult, %1, %c2 : index + scf.if %19 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_5[%21] : memref<4xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_5[%24] : memref<4xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_6[%1] : memref<2xf16, #gpu.address_space> + } + gpu.barrier + %20 = arith.cmpi ult, %1, %c1 : index + scf.if %20 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_6[%21] : memref<2xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_6[%24] : memref<2xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %arg1[%0] : memref<512xf16> + } + gpu.barrier + gpu.return + } } func.func @main(%arg0: memref<64xf32, "cuda"> {byre.argname = "Input0", byre.argtype = 1 : i32}, %arg1: memref<64xf32, "cuda"> {byre.argname = "Input1", byre.argtype = 1 : i32}, %arg2: memref<64x3x7x7xf32, "cuda"> {byre.argname = "Input2", byre.argtype = 1 : i32}, %arg3: memref<1000xf32, "cuda"> {byre.argname = "Input3", byre.argtype = 1 : i32}, %arg4: memref<1000x512xf32, "cuda"> {byre.argname = "Input4", byre.argtype = 1 : i32}, %arg5: memref<64xf32, "cuda"> {byre.argname = "Input5", byre.argtype = 1 : i32}, %arg6: memref<64xf32, "cuda"> {byre.argname = "Input6", byre.argtype = 1 : i32}, %arg7: memref<64xf32, "cuda"> {byre.argname = "Input7", byre.argtype = 1 : i32}, %arg8: memref<64xf32, "cuda"> {byre.argname = "Input8", byre.argtype = 1 : i32}, %arg9: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Input9", byre.argtype = 1 : i32}, %arg10: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Input10", byre.argtype = 1 : i32}, %arg11: memref<64xf32, "cuda"> {byre.argname = "Input11", byre.argtype = 1 : i32}, %arg12: memref<64xf32, "cuda"> {byre.argname = "Input12", byre.argtype = 1 : i32}, %arg13: memref<64xf32, "cuda"> {byre.argname = "Input13", byre.argtype = 1 : i32}, %arg14: memref<64xf32, "cuda"> {byre.argname = "Input14", byre.argtype = 1 : i32}, %arg15: memref<64x64x3x3xf32, "cuda"> {byre.argname = 
"Input15", byre.argtype = 1 : i32}, %arg16: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Input16", byre.argtype = 1 : i32}, %arg17: memref<128xf32, "cuda"> {byre.argname = "Input17", byre.argtype = 1 : i32}, %arg18: memref<128xf32, "cuda"> {byre.argname = "Input18", byre.argtype = 1 : i32}, %arg19: memref<128xf32, "cuda"> {byre.argname = "Input19", byre.argtype = 1 : i32}, %arg20: memref<128xf32, "cuda"> {byre.argname = "Input20", byre.argtype = 1 : i32}, %arg21: memref<128x64x3x3xf32, "cuda"> {byre.argname = "Input21", byre.argtype = 1 : i32}, %arg22: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Input22", byre.argtype = 1 : i32}, %arg23: memref<128x64x1x1xf32, "cuda"> {byre.argname = "Input23", byre.argtype = 1 : i32}, %arg24: memref<128xf32, "cuda"> {byre.argname = "Input24", byre.argtype = 1 : i32}, %arg25: memref<128xf32, "cuda"> {byre.argname = "Input25", byre.argtype = 1 : i32}, %arg26: memref<128xf32, "cuda"> {byre.argname = "Input26", byre.argtype = 1 : i32}, %arg27: memref<128xf32, "cuda"> {byre.argname = "Input27", byre.argtype = 1 : i32}, %arg28: memref<128xf32, "cuda"> {byre.argname = "Input28", byre.argtype = 1 : i32}, %arg29: memref<128xf32, "cuda"> {byre.argname = "Input29", byre.argtype = 1 : i32}, %arg30: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Input30", byre.argtype = 1 : i32}, %arg31: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Input31", byre.argtype = 1 : i32}, %arg32: memref<256xf32, "cuda"> {byre.argname = "Input32", byre.argtype = 1 : i32}, %arg33: memref<256xf32, "cuda"> {byre.argname = "Input33", byre.argtype = 1 : i32}, %arg34: memref<256xf32, "cuda"> {byre.argname = "Input34", byre.argtype = 1 : i32}, %arg35: memref<256xf32, "cuda"> {byre.argname = "Input35", byre.argtype = 1 : i32}, %arg36: memref<256x128x3x3xf32, "cuda"> {byre.argname = "Input36", byre.argtype = 1 : i32}, %arg37: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Input37", byre.argtype = 1 : i32}, %arg38: memref<256x128x1x1xf32, "cuda"> {byre.argname = "Input38", byre.argtype = 1 : i32}, %arg39: memref<256xf32, "cuda"> {byre.argname = "Input39", byre.argtype = 1 : i32}, %arg40: memref<256xf32, "cuda"> {byre.argname = "Input40", byre.argtype = 1 : i32}, %arg41: memref<256xf32, "cuda"> {byre.argname = "Input41", byre.argtype = 1 : i32}, %arg42: memref<256xf32, "cuda"> {byre.argname = "Input42", byre.argtype = 1 : i32}, %arg43: memref<256xf32, "cuda"> {byre.argname = "Input43", byre.argtype = 1 : i32}, %arg44: memref<256xf32, "cuda"> {byre.argname = "Input44", byre.argtype = 1 : i32}, %arg45: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Input45", byre.argtype = 1 : i32}, %arg46: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Input46", byre.argtype = 1 : i32}, %arg47: memref<512xf32, "cuda"> {byre.argname = "Input47", byre.argtype = 1 : i32}, %arg48: memref<512xf32, "cuda"> {byre.argname = "Input48", byre.argtype = 1 : i32}, %arg49: memref<512xf32, "cuda"> {byre.argname = "Input49", byre.argtype = 1 : i32}, %arg50: memref<512xf32, "cuda"> {byre.argname = "Input50", byre.argtype = 1 : i32}, %arg51: memref<512x256x3x3xf32, "cuda"> {byre.argname = "Input51", byre.argtype = 1 : i32}, %arg52: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Input52", byre.argtype = 1 : i32}, %arg53: memref<512x256x1x1xf32, "cuda"> {byre.argname = "Input53", byre.argtype = 1 : i32}, %arg54: memref<512xf32, "cuda"> {byre.argname = "Input54", byre.argtype = 1 : i32}, %arg55: memref<512xf32, "cuda"> {byre.argname = "Input55", byre.argtype = 1 : i32}, %arg56: memref<512xf32, "cuda"> {byre.argname = 
"Input56", byre.argtype = 1 : i32}, %arg57: memref<512xf32, "cuda"> {byre.argname = "Input57", byre.argtype = 1 : i32}, %arg58: memref<512xf32, "cuda"> {byre.argname = "Input58", byre.argtype = 1 : i32}, %arg59: memref<512xf32, "cuda"> {byre.argname = "Input59", byre.argtype = 1 : i32}, %arg60: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Input60", byre.argtype = 1 : i32}, %arg61: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Input61", byre.argtype = 1 : i32}, %arg62: memref {byre.argname = "Input62", byre.argtype = 1 : i32}, %arg63: memref<64xf32, "cuda"> {byre.argname = "Input63", byre.argtype = 1 : i32}, %arg64: memref<64xf32, "cuda"> {byre.argname = "Input64", byre.argtype = 1 : i32}, %arg65: memref {byre.argname = "Input65", byre.argtype = 1 : i32}, %arg66: memref<64xf32, "cuda"> {byre.argname = "Input66", byre.argtype = 1 : i32}, %arg67: memref<64xf32, "cuda"> {byre.argname = "Input67", byre.argtype = 1 : i32}, %arg68: memref {byre.argname = "Input68", byre.argtype = 1 : i32}, %arg69: memref<64xf32, "cuda"> {byre.argname = "Input69", byre.argtype = 1 : i32}, %arg70: memref<64xf32, "cuda"> {byre.argname = "Input70", byre.argtype = 1 : i32}, %arg71: memref {byre.argname = "Input71", byre.argtype = 1 : i32}, %arg72: memref<64xf32, "cuda"> {byre.argname = "Input72", byre.argtype = 1 : i32}, %arg73: memref<64xf32, "cuda"> {byre.argname = "Input73", byre.argtype = 1 : i32}, %arg74: memref {byre.argname = "Input74", byre.argtype = 1 : i32}, %arg75: memref<64xf32, "cuda"> {byre.argname = "Input75", byre.argtype = 1 : i32}, %arg76: memref<64xf32, "cuda"> {byre.argname = "Input76", byre.argtype = 1 : i32}, %arg77: memref {byre.argname = "Input77", byre.argtype = 1 : i32}, %arg78: memref<128xf32, "cuda"> {byre.argname = "Input78", byre.argtype = 1 : i32}, %arg79: memref<128xf32, "cuda"> {byre.argname = "Input79", byre.argtype = 1 : i32}, %arg80: memref {byre.argname = "Input80", byre.argtype = 1 : i32}, %arg81: memref<128xf32, "cuda"> {byre.argname = "Input81", byre.argtype = 1 : i32}, %arg82: memref<128xf32, "cuda"> {byre.argname = "Input82", byre.argtype = 1 : i32}, %arg83: memref {byre.argname = "Input83", byre.argtype = 1 : i32}, %arg84: memref<128xf32, "cuda"> {byre.argname = "Input84", byre.argtype = 1 : i32}, %arg85: memref<128xf32, "cuda"> {byre.argname = "Input85", byre.argtype = 1 : i32}, %arg86: memref {byre.argname = "Input86", byre.argtype = 1 : i32}, %arg87: memref<128xf32, "cuda"> {byre.argname = "Input87", byre.argtype = 1 : i32}, %arg88: memref<128xf32, "cuda"> {byre.argname = "Input88", byre.argtype = 1 : i32}, %arg89: memref {byre.argname = "Input89", byre.argtype = 1 : i32}, %arg90: memref<128xf32, "cuda"> {byre.argname = "Input90", byre.argtype = 1 : i32}, %arg91: memref<128xf32, "cuda"> {byre.argname = "Input91", byre.argtype = 1 : i32}, %arg92: memref {byre.argname = "Input92", byre.argtype = 1 : i32}, %arg93: memref<256xf32, "cuda"> {byre.argname = "Input93", byre.argtype = 1 : i32}, %arg94: memref<256xf32, "cuda"> {byre.argname = "Input94", byre.argtype = 1 : i32}, %arg95: memref {byre.argname = "Input95", byre.argtype = 1 : i32}, %arg96: memref<256xf32, "cuda"> {byre.argname = "Input96", byre.argtype = 1 : i32}, %arg97: memref<256xf32, "cuda"> {byre.argname = "Input97", byre.argtype = 1 : i32}, %arg98: memref {byre.argname = "Input98", byre.argtype = 1 : i32}, %arg99: memref<256xf32, "cuda"> {byre.argname = "Input99", byre.argtype = 1 : i32}, %arg100: memref<256xf32, "cuda"> {byre.argname = "Input100", byre.argtype = 1 : i32}, %arg101: memref {byre.argname = 
"Input101", byre.argtype = 1 : i32}, %arg102: memref<256xf32, "cuda"> {byre.argname = "Input102", byre.argtype = 1 : i32}, %arg103: memref<256xf32, "cuda"> {byre.argname = "Input103", byre.argtype = 1 : i32}, %arg104: memref {byre.argname = "Input104", byre.argtype = 1 : i32}, %arg105: memref<256xf32, "cuda"> {byre.argname = "Input105", byre.argtype = 1 : i32}, %arg106: memref<256xf32, "cuda"> {byre.argname = "Input106", byre.argtype = 1 : i32}, %arg107: memref {byre.argname = "Input107", byre.argtype = 1 : i32}, %arg108: memref<512xf32, "cuda"> {byre.argname = "Input108", byre.argtype = 1 : i32}, %arg109: memref<512xf32, "cuda"> {byre.argname = "Input109", byre.argtype = 1 : i32}, %arg110: memref {byre.argname = "Input110", byre.argtype = 1 : i32}, %arg111: memref<512xf32, "cuda"> {byre.argname = "Input111", byre.argtype = 1 : i32}, %arg112: memref<512xf32, "cuda"> {byre.argname = "Input112", byre.argtype = 1 : i32}, %arg113: memref {byre.argname = "Input113", byre.argtype = 1 : i32}, %arg114: memref<512xf32, "cuda"> {byre.argname = "Input114", byre.argtype = 1 : i32}, %arg115: memref<512xf32, "cuda"> {byre.argname = "Input115", byre.argtype = 1 : i32}, %arg116: memref {byre.argname = "Input116", byre.argtype = 1 : i32}, %arg117: memref<512xf32, "cuda"> {byre.argname = "Input117", byre.argtype = 1 : i32}, %arg118: memref<512xf32, "cuda"> {byre.argname = "Input118", byre.argtype = 1 : i32}, %arg119: memref {byre.argname = "Input119", byre.argtype = 1 : i32}, %arg120: memref<512xf32, "cuda"> {byre.argname = "Input120", byre.argtype = 1 : i32}, %arg121: memref<512xf32, "cuda"> {byre.argname = "Input121", byre.argtype = 1 : i32}, %arg122: memref<1x3x224x224xf32, "cuda"> {byre.argname = "Input122", byre.argtype = 1 : i32}, %arg123: memref<1x1000xf16, "cuda"> {byre.argname = "Output0", byre.argtype = 2 : i32}, %arg124: memref<64xf32, "cuda"> {byre.arg_alias_index = 0 : i64, byre.argname = "Output1", byre.argtype = 2 : i32}, %arg125: memref<64xf32, "cuda"> {byre.arg_alias_index = 1 : i64, byre.argname = "Output2", byre.argtype = 2 : i32}, %arg126: memref<64xf32, "cuda"> {byre.arg_alias_index = 5 : i64, byre.argname = "Output3", byre.argtype = 2 : i32}, %arg127: memref<64xf32, "cuda"> {byre.arg_alias_index = 6 : i64, byre.argname = "Output4", byre.argtype = 2 : i32}, %arg128: memref<64xf32, "cuda"> {byre.arg_alias_index = 7 : i64, byre.argname = "Output5", byre.argtype = 2 : i32}, %arg129: memref<64xf32, "cuda"> {byre.arg_alias_index = 8 : i64, byre.argname = "Output6", byre.argtype = 2 : i32}, %arg130: memref<64xf32, "cuda"> {byre.arg_alias_index = 11 : i64, byre.argname = "Output7", byre.argtype = 2 : i32}, %arg131: memref<64xf32, "cuda"> {byre.arg_alias_index = 12 : i64, byre.argname = "Output8", byre.argtype = 2 : i32}, %arg132: memref<64xf32, "cuda"> {byre.arg_alias_index = 13 : i64, byre.argname = "Output9", byre.argtype = 2 : i32}, %arg133: memref<64xf32, "cuda"> {byre.arg_alias_index = 14 : i64, byre.argname = "Output10", byre.argtype = 2 : i32}, %arg134: memref<128xf32, "cuda"> {byre.arg_alias_index = 17 : i64, byre.argname = "Output11", byre.argtype = 2 : i32}, %arg135: memref<128xf32, "cuda"> {byre.arg_alias_index = 18 : i64, byre.argname = "Output12", byre.argtype = 2 : i32}, %arg136: memref<128xf32, "cuda"> {byre.arg_alias_index = 19 : i64, byre.argname = "Output13", byre.argtype = 2 : i32}, %arg137: memref<128xf32, "cuda"> {byre.arg_alias_index = 20 : i64, byre.argname = "Output14", byre.argtype = 2 : i32}, %arg138: memref<128xf32, "cuda"> {byre.arg_alias_index = 24 : i64, 
byre.argname = "Output15", byre.argtype = 2 : i32}, %arg139: memref<128xf32, "cuda"> {byre.arg_alias_index = 25 : i64, byre.argname = "Output16", byre.argtype = 2 : i32}, %arg140: memref<128xf32, "cuda"> {byre.arg_alias_index = 26 : i64, byre.argname = "Output17", byre.argtype = 2 : i32}, %arg141: memref<128xf32, "cuda"> {byre.arg_alias_index = 27 : i64, byre.argname = "Output18", byre.argtype = 2 : i32}, %arg142: memref<128xf32, "cuda"> {byre.arg_alias_index = 28 : i64, byre.argname = "Output19", byre.argtype = 2 : i32}, %arg143: memref<128xf32, "cuda"> {byre.arg_alias_index = 29 : i64, byre.argname = "Output20", byre.argtype = 2 : i32}, %arg144: memref<256xf32, "cuda"> {byre.arg_alias_index = 32 : i64, byre.argname = "Output21", byre.argtype = 2 : i32}, %arg145: memref<256xf32, "cuda"> {byre.arg_alias_index = 33 : i64, byre.argname = "Output22", byre.argtype = 2 : i32}, %arg146: memref<256xf32, "cuda"> {byre.arg_alias_index = 34 : i64, byre.argname = "Output23", byre.argtype = 2 : i32}, %arg147: memref<256xf32, "cuda"> {byre.arg_alias_index = 35 : i64, byre.argname = "Output24", byre.argtype = 2 : i32}, %arg148: memref<256xf32, "cuda"> {byre.arg_alias_index = 39 : i64, byre.argname = "Output25", byre.argtype = 2 : i32}, %arg149: memref<256xf32, "cuda"> {byre.arg_alias_index = 40 : i64, byre.argname = "Output26", byre.argtype = 2 : i32}, %arg150: memref<256xf32, "cuda"> {byre.arg_alias_index = 41 : i64, byre.argname = "Output27", byre.argtype = 2 : i32}, %arg151: memref<256xf32, "cuda"> {byre.arg_alias_index = 42 : i64, byre.argname = "Output28", byre.argtype = 2 : i32}, %arg152: memref<256xf32, "cuda"> {byre.arg_alias_index = 43 : i64, byre.argname = "Output29", byre.argtype = 2 : i32}, %arg153: memref<256xf32, "cuda"> {byre.arg_alias_index = 44 : i64, byre.argname = "Output30", byre.argtype = 2 : i32}, %arg154: memref<512xf32, "cuda"> {byre.arg_alias_index = 47 : i64, byre.argname = "Output31", byre.argtype = 2 : i32}, %arg155: memref<512xf32, "cuda"> {byre.arg_alias_index = 48 : i64, byre.argname = "Output32", byre.argtype = 2 : i32}, %arg156: memref<512xf32, "cuda"> {byre.arg_alias_index = 49 : i64, byre.argname = "Output33", byre.argtype = 2 : i32}, %arg157: memref<512xf32, "cuda"> {byre.arg_alias_index = 50 : i64, byre.argname = "Output34", byre.argtype = 2 : i32}, %arg158: memref<512xf32, "cuda"> {byre.arg_alias_index = 54 : i64, byre.argname = "Output35", byre.argtype = 2 : i32}, %arg159: memref<512xf32, "cuda"> {byre.arg_alias_index = 55 : i64, byre.argname = "Output36", byre.argtype = 2 : i32}, %arg160: memref<512xf32, "cuda"> {byre.arg_alias_index = 56 : i64, byre.argname = "Output37", byre.argtype = 2 : i32}, %arg161: memref<512xf32, "cuda"> {byre.arg_alias_index = 57 : i64, byre.argname = "Output38", byre.argtype = 2 : i32}, %arg162: memref<512xf32, "cuda"> {byre.arg_alias_index = 58 : i64, byre.argname = "Output39", byre.argtype = 2 : i32}, %arg163: memref<512xf32, "cuda"> {byre.arg_alias_index = 59 : i64, byre.argname = "Output40", byre.argtype = 2 : i32}, %arg164: memref<64xf32, "cuda"> {byre.argname = "Output41", byre.argtype = 2 : i32}, %arg165: memref<64xf32, "cuda"> {byre.argname = "Output42", byre.argtype = 2 : i32}, %arg166: memref<64xf32, "cuda"> {byre.argname = "Output43", byre.argtype = 2 : i32}, %arg167: memref<64xf32, "cuda"> {byre.argname = "Output44", byre.argtype = 2 : i32}, %arg168: memref<64xf32, "cuda"> {byre.argname = "Output45", byre.argtype = 2 : i32}, %arg169: memref<64xf32, "cuda"> {byre.argname = "Output46", byre.argtype = 2 : i32}, %arg170: 
memref<64xf32, "cuda"> {byre.argname = "Output47", byre.argtype = 2 : i32}, %arg171: memref<64xf32, "cuda"> {byre.argname = "Output48", byre.argtype = 2 : i32}, %arg172: memref<64xf32, "cuda"> {byre.argname = "Output49", byre.argtype = 2 : i32}, %arg173: memref<64xf32, "cuda"> {byre.argname = "Output50", byre.argtype = 2 : i32}, %arg174: memref<128xf32, "cuda"> {byre.argname = "Output51", byre.argtype = 2 : i32}, %arg175: memref<128xf32, "cuda"> {byre.argname = "Output52", byre.argtype = 2 : i32}, %arg176: memref<128xf32, "cuda"> {byre.argname = "Output53", byre.argtype = 2 : i32}, %arg177: memref<128xf32, "cuda"> {byre.argname = "Output54", byre.argtype = 2 : i32}, %arg178: memref<128xf32, "cuda"> {byre.argname = "Output55", byre.argtype = 2 : i32}, %arg179: memref<128xf32, "cuda"> {byre.argname = "Output56", byre.argtype = 2 : i32}, %arg180: memref<128xf32, "cuda"> {byre.argname = "Output57", byre.argtype = 2 : i32}, %arg181: memref<128xf32, "cuda"> {byre.argname = "Output58", byre.argtype = 2 : i32}, %arg182: memref<128xf32, "cuda"> {byre.argname = "Output59", byre.argtype = 2 : i32}, %arg183: memref<128xf32, "cuda"> {byre.argname = "Output60", byre.argtype = 2 : i32}, %arg184: memref<256xf32, "cuda"> {byre.argname = "Output61", byre.argtype = 2 : i32}, %arg185: memref<256xf32, "cuda"> {byre.argname = "Output62", byre.argtype = 2 : i32}, %arg186: memref<256xf32, "cuda"> {byre.argname = "Output63", byre.argtype = 2 : i32}, %arg187: memref<256xf32, "cuda"> {byre.argname = "Output64", byre.argtype = 2 : i32}, %arg188: memref<256xf32, "cuda"> {byre.argname = "Output65", byre.argtype = 2 : i32}, %arg189: memref<256xf32, "cuda"> {byre.argname = "Output66", byre.argtype = 2 : i32}, %arg190: memref<256xf32, "cuda"> {byre.argname = "Output67", byre.argtype = 2 : i32}, %arg191: memref<256xf32, "cuda"> {byre.argname = "Output68", byre.argtype = 2 : i32}, %arg192: memref<256xf32, "cuda"> {byre.argname = "Output69", byre.argtype = 2 : i32}, %arg193: memref<256xf32, "cuda"> {byre.argname = "Output70", byre.argtype = 2 : i32}, %arg194: memref<512xf32, "cuda"> {byre.argname = "Output71", byre.argtype = 2 : i32}, %arg195: memref<512xf32, "cuda"> {byre.argname = "Output72", byre.argtype = 2 : i32}, %arg196: memref<512xf32, "cuda"> {byre.argname = "Output73", byre.argtype = 2 : i32}, %arg197: memref<512xf32, "cuda"> {byre.argname = "Output74", byre.argtype = 2 : i32}, %arg198: memref<512xf32, "cuda"> {byre.argname = "Output75", byre.argtype = 2 : i32}, %arg199: memref<512xf32, "cuda"> {byre.argname = "Output76", byre.argtype = 2 : i32}, %arg200: memref<512xf32, "cuda"> {byre.argname = "Output77", byre.argtype = 2 : i32}, %arg201: memref<512xf32, "cuda"> {byre.argname = "Output78", byre.argtype = 2 : i32}, %arg202: memref<512xf32, "cuda"> {byre.argname = "Output79", byre.argtype = 2 : i32}, %arg203: memref<512xf32, "cuda"> {byre.argname = "Output80", byre.argtype = 2 : i32}, %arg204: memref<64x3x7x7xf16, "cuda"> {byre.argname = "Output81", byre.argtype = 2 : i32}, %arg205: memref<1x3x224x224xf16, "cuda"> {byre.argname = "Output82", byre.argtype = 2 : i32}, %arg206: memref<1x64x112x112xf16, "cuda"> {byre.argname = "Output83", byre.argtype = 2 : i32}, %arg207: memref<1x64x112x112xf16, "cuda"> {byre.argname = "Output84", byre.argtype = 2 : i32}, %arg208: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output85", byre.argtype = 2 : i32}, %arg209: memref<64x64x3x3xf16, "cuda"> {byre.argname = "Output86", byre.argtype = 2 : i32}, %arg210: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output87", byre.argtype = 
2 : i32}, %arg211: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output88", byre.argtype = 2 : i32}, %arg212: memref<64x64x3x3xf16, "cuda"> {byre.argname = "Output89", byre.argtype = 2 : i32}, %arg213: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output90", byre.argtype = 2 : i32}, %arg214: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output91", byre.argtype = 2 : i32}, %arg215: memref<64x64x3x3xf16, "cuda"> {byre.argname = "Output92", byre.argtype = 2 : i32}, %arg216: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output93", byre.argtype = 2 : i32}, %arg217: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output94", byre.argtype = 2 : i32}, %arg218: memref<64x64x3x3xf16, "cuda"> {byre.argname = "Output95", byre.argtype = 2 : i32}, %arg219: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output96", byre.argtype = 2 : i32}, %arg220: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output97", byre.argtype = 2 : i32}, %arg221: memref<128x64x3x3xf16, "cuda"> {byre.argname = "Output98", byre.argtype = 2 : i32}, %arg222: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output99", byre.argtype = 2 : i32}, %arg223: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output100", byre.argtype = 2 : i32}, %arg224: memref<128x128x3x3xf16, "cuda"> {byre.argname = "Output101", byre.argtype = 2 : i32}, %arg225: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output102", byre.argtype = 2 : i32}, %arg226: memref<128x64x1x1xf16, "cuda"> {byre.argname = "Output103", byre.argtype = 2 : i32}, %arg227: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output104", byre.argtype = 2 : i32}, %arg228: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output105", byre.argtype = 2 : i32}, %arg229: memref<128x128x3x3xf16, "cuda"> {byre.argname = "Output106", byre.argtype = 2 : i32}, %arg230: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output107", byre.argtype = 2 : i32}, %arg231: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output108", byre.argtype = 2 : i32}, %arg232: memref<128x128x3x3xf16, "cuda"> {byre.argname = "Output109", byre.argtype = 2 : i32}, %arg233: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output110", byre.argtype = 2 : i32}, %arg234: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output111", byre.argtype = 2 : i32}, %arg235: memref<256x128x3x3xf16, "cuda"> {byre.argname = "Output112", byre.argtype = 2 : i32}, %arg236: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output113", byre.argtype = 2 : i32}, %arg237: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output114", byre.argtype = 2 : i32}, %arg238: memref<256x256x3x3xf16, "cuda"> {byre.argname = "Output115", byre.argtype = 2 : i32}, %arg239: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output116", byre.argtype = 2 : i32}, %arg240: memref<256x128x1x1xf16, "cuda"> {byre.argname = "Output117", byre.argtype = 2 : i32}, %arg241: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output118", byre.argtype = 2 : i32}, %arg242: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output119", byre.argtype = 2 : i32}, %arg243: memref<256x256x3x3xf16, "cuda"> {byre.argname = "Output120", byre.argtype = 2 : i32}, %arg244: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output121", byre.argtype = 2 : i32}, %arg245: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output122", byre.argtype = 2 : i32}, %arg246: memref<256x256x3x3xf16, "cuda"> {byre.argname = "Output123", byre.argtype = 2 : i32}, %arg247: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output124", byre.argtype = 2 : i32}, %arg248: memref<1x256x14x14xf16, "cuda"> {byre.argname = 
"Output125", byre.argtype = 2 : i32}, %arg249: memref<512x256x3x3xf16, "cuda"> {byre.argname = "Output126", byre.argtype = 2 : i32}, %arg250: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output127", byre.argtype = 2 : i32}, %arg251: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output128", byre.argtype = 2 : i32}, %arg252: memref<512x512x3x3xf16, "cuda"> {byre.argname = "Output129", byre.argtype = 2 : i32}, %arg253: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output130", byre.argtype = 2 : i32}, %arg254: memref<512x256x1x1xf16, "cuda"> {byre.argname = "Output131", byre.argtype = 2 : i32}, %arg255: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output132", byre.argtype = 2 : i32}, %arg256: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output133", byre.argtype = 2 : i32}, %arg257: memref<512x512x3x3xf16, "cuda"> {byre.argname = "Output134", byre.argtype = 2 : i32}, %arg258: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output135", byre.argtype = 2 : i32}, %arg259: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output136", byre.argtype = 2 : i32}, %arg260: memref<512x512x3x3xf16, "cuda"> {byre.argname = "Output137", byre.argtype = 2 : i32}, %arg261: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output138", byre.argtype = 2 : i32}, %arg262: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output139", byre.argtype = 2 : i32}, %arg263: memref<1x512xf16, "cuda"> {byre.argname = "Output140", byre.argtype = 2 : i32}, %arg264: memref<512x1000xf16, "cuda"> {byre.argname = "Output141", byre.argtype = 2 : i32}) attributes {byre.entry_point} { %alloc = memref.alloc() : memref<1838592xi8, "cuda"> - byre.compute @PTXOp(%arg122, %arg205) {BlockSize.x = 128 : i32, GridSize.x = 1176 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown0", memory_effects = [1 : i32, 2 : i32]} : memref<1x3x224x224xf32, "cuda">, memref<1x3x224x224xf16, "cuda"> - byre.compute @PTXOp(%arg2, %arg204) {BlockSize.x = 128 : i32, GridSize.x = 74 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown1", memory_effects = [1 : i32, 2 : i32]} : memref<64x3x7x7xf32, "cuda">, memref<64x3x7x7xf16, "cuda"> + byre.compute @PTXOp(%arg122, %arg205) {BlockSize.x = 256 : i32, GridSize.x = 147 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown0", memory_effects = [1 : i32, 2 : i32]} : memref<1x3x224x224xf32, "cuda">, memref<1x3x224x224xf16, "cuda"> + byre.compute @PTXOp(%arg2, %arg204) {BlockSize.x = 256 : i32, GridSize.x = 10 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown1", memory_effects = [1 : i32, 2 : i32]} : memref<64x3x7x7xf32, "cuda">, memref<64x3x7x7xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg205, %arg204, %arg206) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x3x224x224xf16, "cuda">, memref<64x3x7x7xf16, "cuda">, memref<1x64x112x112xf16, "cuda"> - %0 = "byre.alias"(%alloc) {offset = 224768 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x64x112x112xf16, "cuda"> - %1 = "byre.alias"(%alloc) {offset = 7424 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> - %2 = "byre.alias"(%alloc) {offset = 7168 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %0 = "byre.alias"(%alloc) <{offset = 224768 : i64}> : (memref<1838592xi8, "cuda">) -> 
memref<1x64x112x112xf16, "cuda"> + %1 = "byre.alias"(%alloc) <{offset = 7424 : i64}> : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %2 = "byre.alias"(%alloc) <{offset = 7168 : i64}> : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg206, %arg1, %arg0, %0, %1, %2) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<1x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%0, %arg207) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown3", memory_effects = [1 : i32, 2 : i32]} : memref<1x64x112x112xf16, "cuda">, memref<1x64x112x112xf16, "cuda"> + byre.compute @PTXOp(%0, %arg207) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown3", memory_effects = [1 : i32, 2 : i32]} : memref<1x64x112x112xf16, "cuda">, memref<1x64x112x112xf16, "cuda"> byre.compute @PoolMaxOp_f16_f16(%arg207, %arg208) {base_dilations = dense<1> : tensor<4xi64>, device = "cuda", memory_effects = [1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = dense<1> : tensor<4xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<1x64x112x112xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - byre.compute @PTXOp(%arg9, %arg209) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown4", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg9, %arg209) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown4", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg208, %arg209, %arg210) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %3 = "byre.alias"(%alloc) {offset = 224768 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> - %4 = "byre.alias"(%alloc) {offset = 6912 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> - %5 = "byre.alias"(%alloc) {offset = 6656 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %3 = "byre.alias"(%alloc) <{offset = 224768 : i64}> : (memref<1838592xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> + %4 = "byre.alias"(%alloc) <{offset = 6912 : i64}> : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %5 = "byre.alias"(%alloc) <{offset = 6656 : i64}> : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg210, %arg6, %arg5, %3, %4, %5) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, 
"cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%3, %arg211) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown6", memory_effects = [1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - byre.compute @PTXOp(%arg10, %arg212) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown7", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%3, %arg211) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown6", memory_effects = [1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + byre.compute @PTXOp(%arg10, %arg212) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown4", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg211, %arg212, %arg213) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %6 = "byre.alias"(%alloc) {offset = 6400 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> - %7 = "byre.alias"(%alloc) {offset = 6144 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %6 = "byre.alias"(%alloc) <{offset = 6400 : i64}> : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %7 = "byre.alias"(%alloc) <{offset = 6144 : i64}> : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg213, %arg8, %arg7, %3, %6, %7) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%3, %arg208, %arg214) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown9", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - byre.compute @PTXOp(%arg15, %arg215) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown10", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%3, %arg208, %arg214) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown9", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + byre.compute @PTXOp(%arg15, %arg215) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown4", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> byre.compute 
@ConvOp_f16f16_f16(%arg214, %arg215, %arg216) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %8 = "byre.alias"(%alloc) {offset = 5888 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> - %9 = "byre.alias"(%alloc) {offset = 5632 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %8 = "byre.alias"(%alloc) <{offset = 5888 : i64}> : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %9 = "byre.alias"(%alloc) <{offset = 5632 : i64}> : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg216, %arg12, %arg11, %3, %8, %9) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%3, %arg217) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown12", memory_effects = [1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - byre.compute @PTXOp(%arg16, %arg218) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown13", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%3, %arg217) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown6", memory_effects = [1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + byre.compute @PTXOp(%arg16, %arg218) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown4", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg217, %arg218, %arg219) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %10 = "byre.alias"(%alloc) {offset = 5376 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> - %11 = "byre.alias"(%alloc) {offset = 0 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %10 = "byre.alias"(%alloc) <{offset = 5376 : i64}> : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %11 = "byre.alias"(%alloc) <{offset = 0 : i64}> : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg219, %arg14, %arg13, %3, %10, %11) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, 
"cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%3, %arg214, %arg220) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown15", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - byre.compute @PTXOp(%arg23, %arg226) {BlockSize.x = 128 : i32, GridSize.x = 64 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown16", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x1x1xf32, "cuda">, memref<128x64x1x1xf16, "cuda"> + byre.compute @PTXOp(%3, %arg214, %arg220) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown9", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + byre.compute @PTXOp(%arg23, %arg226) {BlockSize.x = 256 : i32, GridSize.x = 8 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown16", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x1x1xf32, "cuda">, memref<128x64x1x1xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg220, %arg226, %arg227) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<128x64x1x1xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %12 = "byre.alias"(%alloc) {offset = 8704 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> - %13 = "byre.alias"(%alloc) {offset = 256 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> - %14 = "byre.alias"(%alloc) {offset = 768 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %12 = "byre.alias"(%alloc) <{offset = 8704 : i64}> : (memref<1838592xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + %13 = "byre.alias"(%alloc) <{offset = 256 : i64}> : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %14 = "byre.alias"(%alloc) <{offset = 768 : i64}> : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg227, %arg25, %arg24, %12, %13, %14) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%arg21, %arg221) {BlockSize.x = 128 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown18", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x3x3xf32, "cuda">, memref<128x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg21, %arg221) {BlockSize.x = 256 : i32, GridSize.x = 72 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown18", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x3x3xf32, "cuda">, memref<128x64x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg220, %arg221, %arg222) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = 
[1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<128x64x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %15 = "byre.alias"(%alloc) {offset = 224768 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> - %16 = "byre.alias"(%alloc) {offset = 4864 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> - %17 = "byre.alias"(%alloc) {offset = 1280 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %15 = "byre.alias"(%alloc) <{offset = 224768 : i64}> : (memref<1838592xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + %16 = "byre.alias"(%alloc) <{offset = 4864 : i64}> : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %17 = "byre.alias"(%alloc) <{offset = 1280 : i64}> : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg222, %arg18, %arg17, %15, %16, %17) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%15, %arg223) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown20", memory_effects = [1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - byre.compute @PTXOp(%arg22, %arg224) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown21", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%15, %arg223) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown20", memory_effects = [1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%arg22, %arg224) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown21", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg223, %arg224, %arg225) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %18 = "byre.alias"(%alloc) {offset = 1792 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> - %19 = "byre.alias"(%alloc) {offset = 2304 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %18 = "byre.alias"(%alloc) <{offset = 1792 : i64}> : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %19 = "byre.alias"(%alloc) <{offset = 2304 : i64}> : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg225, %arg20, %arg19, %15, %18, %19) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : 
i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%15, %12, %arg228) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown23", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - byre.compute @PTXOp(%arg30, %arg229) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown24", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%15, %12, %arg228) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown23", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%arg30, %arg229) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown21", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg228, %arg229, %arg230) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %20 = "byre.alias"(%alloc) {offset = 2816 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> - %21 = "byre.alias"(%alloc) {offset = 3328 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %20 = "byre.alias"(%alloc) <{offset = 2816 : i64}> : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %21 = "byre.alias"(%alloc) <{offset = 3328 : i64}> : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg230, %arg27, %arg26, %15, %20, %21) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%15, %arg231) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown26", memory_effects = [1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - byre.compute @PTXOp(%arg31, %arg232) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown27", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%15, %arg231) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown20", memory_effects = [1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%arg31, %arg232) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown21", 
memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg231, %arg232, %arg233) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %22 = "byre.alias"(%alloc) {offset = 3840 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> - %23 = "byre.alias"(%alloc) {offset = 4352 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %22 = "byre.alias"(%alloc) <{offset = 3840 : i64}> : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %23 = "byre.alias"(%alloc) <{offset = 4352 : i64}> : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg233, %arg29, %arg28, %15, %22, %23) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%15, %arg228, %arg234) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown29", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - byre.compute @PTXOp(%arg38, %arg240) {BlockSize.x = 128 : i32, GridSize.x = 256 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown30", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x1x1xf32, "cuda">, memref<256x128x1x1xf16, "cuda"> + byre.compute @PTXOp(%15, %arg228, %arg234) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown23", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%arg38, %arg240) {BlockSize.x = 256 : i32, GridSize.x = 32 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown30", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x1x1xf32, "cuda">, memref<256x128x1x1xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg234, %arg240, %arg241) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<256x128x1x1xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %24 = "byre.alias"(%alloc) {offset = 224768 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - %25 = "byre.alias"(%alloc) {offset = 223744 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> - %26 = "byre.alias"(%alloc) {offset = 1836544 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %24 = "byre.alias"(%alloc) <{offset = 224768 : i64}> : (memref<1838592xi8, "cuda">) -> 
memref<1x256x14x14xf16, "cuda"> + %25 = "byre.alias"(%alloc) <{offset = 223744 : i64}> : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %26 = "byre.alias"(%alloc) <{offset = 1836544 : i64}> : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg241, %arg40, %arg39, %24, %25, %26) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%arg36, %arg235) {BlockSize.x = 128 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown32", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x3x3xf32, "cuda">, memref<256x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg36, %arg235) {BlockSize.x = 256 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown32", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x3x3xf32, "cuda">, memref<256x128x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg234, %arg235, %arg236) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %27 = "byre.alias"(%alloc) {offset = 325120 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - %28 = "byre.alias"(%alloc) {offset = 1835520 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> - %29 = "byre.alias"(%alloc) {offset = 1834496 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %27 = "byre.alias"(%alloc) <{offset = 325120 : i64}> : (memref<1838592xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + %28 = "byre.alias"(%alloc) <{offset = 1835520 : i64}> : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %29 = "byre.alias"(%alloc) <{offset = 1834496 : i64}> : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg236, %arg33, %arg32, %27, %28, %29) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%27, %arg237) {BlockSize.x = 128 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown34", memory_effects = [1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - byre.compute @PTXOp(%arg37, %arg238) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown35", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%27, %arg237) {BlockSize.x = 256 : i32, GridSize.x = 49 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown34", memory_effects = [1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + byre.compute 
@PTXOp(%arg37, %arg238) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown35", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg237, %arg238, %arg239) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %30 = "byre.alias"(%alloc) {offset = 1833472 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> - %31 = "byre.alias"(%alloc) {offset = 1837568 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %30 = "byre.alias"(%alloc) <{offset = 1833472 : i64}> : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %31 = "byre.alias"(%alloc) <{offset = 1837568 : i64}> : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg239, %arg35, %arg34, %27, %30, %31) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%27, %24, %arg242) {BlockSize.x = 128 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown37", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - byre.compute @PTXOp(%arg45, %arg243) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown38", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%27, %24, %arg242) {BlockSize.x = 256 : i32, GridSize.x = 49 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown37", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + byre.compute @PTXOp(%arg45, %arg243) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown35", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg242, %arg243, %arg244) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %32 = "byre.alias"(%alloc) {offset = 1832448 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> - %33 = "byre.alias"(%alloc) {offset = 1831424 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %32 = "byre.alias"(%alloc) <{offset = 1832448 : i64}> : (memref<1838592xi8, 
"cuda">) -> memref<256xf32, "cuda"> + %33 = "byre.alias"(%alloc) <{offset = 1831424 : i64}> : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg244, %arg42, %arg41, %24, %32, %33) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%24, %arg245) {BlockSize.x = 128 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown40", memory_effects = [1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - byre.compute @PTXOp(%arg46, %arg246) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown41", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%24, %arg245) {BlockSize.x = 256 : i32, GridSize.x = 49 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown34", memory_effects = [1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + byre.compute @PTXOp(%arg46, %arg246) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown35", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg245, %arg246, %arg247) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %34 = "byre.alias"(%alloc) {offset = 1830400 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> - %35 = "byre.alias"(%alloc) {offset = 7680 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %34 = "byre.alias"(%alloc) <{offset = 1830400 : i64}> : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %35 = "byre.alias"(%alloc) <{offset = 7680 : i64}> : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg247, %arg44, %arg43, %24, %34, %35) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%24, %arg242, %arg248) {BlockSize.x = 128 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown43", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - byre.compute @PTXOp(%arg53, %arg254) {BlockSize.x = 128 : i32, GridSize.x = 1024 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown44", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x1x1xf32, "cuda">, memref<512x256x1x1xf16, "cuda"> + byre.compute @PTXOp(%24, %arg242, %arg248) {BlockSize.x = 
256 : i32, GridSize.x = 49 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown37", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + byre.compute @PTXOp(%arg53, %arg254) {BlockSize.x = 256 : i32, GridSize.x = 128 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown44", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x1x1xf32, "cuda">, memref<512x256x1x1xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg248, %arg254, %arg255) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %36 = "byre.alias"(%alloc) {offset = 224768 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> - %37 = "byre.alias"(%alloc) {offset = 8704 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> - %38 = "byre.alias"(%alloc) {offset = 209408 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %36 = "byre.alias"(%alloc) <{offset = 224768 : i64}> : (memref<1838592xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + %37 = "byre.alias"(%alloc) <{offset = 8704 : i64}> : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %38 = "byre.alias"(%alloc) <{offset = 209408 : i64}> : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg255, %arg55, %arg54, %36, %37, %38) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%arg51, %arg249) {BlockSize.x = 128 : i32, GridSize.x = 9216 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown46", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x3x3xf32, "cuda">, memref<512x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg51, %arg249) {BlockSize.x = 256 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown46", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x3x3xf32, "cuda">, memref<512x256x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg248, %arg249, %arg250) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<512x256x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %39 = "byre.alias"(%alloc) {offset = 274944 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> - %40 = "byre.alias"(%alloc) {offset = 12800 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> - %41 = "byre.alias"(%alloc) {offset = 10752 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %39 = "byre.alias"(%alloc) <{offset = 274944 : i64}> : (memref<1838592xi8, "cuda">) -> 
memref<1x512x7x7xf16, "cuda"> + %40 = "byre.alias"(%alloc) <{offset = 12800 : i64}> : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %41 = "byre.alias"(%alloc) <{offset = 10752 : i64}> : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg250, %arg48, %arg47, %39, %40, %41) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%39, %arg251) {BlockSize.x = 128 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown48", memory_effects = [1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - byre.compute @PTXOp(%arg52, %arg252) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown49", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%39, %arg251) {BlockSize.x = 256 : i32, GridSize.x = 25 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown48", memory_effects = [1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> + byre.compute @PTXOp(%arg52, %arg252) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown49", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg251, %arg252, %arg253) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %42 = "byre.alias"(%alloc) {offset = 211456 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> - %43 = "byre.alias"(%alloc) {offset = 213504 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %42 = "byre.alias"(%alloc) <{offset = 211456 : i64}> : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %43 = "byre.alias"(%alloc) <{offset = 213504 : i64}> : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg253, %arg50, %arg49, %39, %42, %43) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%39, %36, %arg256) {BlockSize.x = 128 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown51", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - byre.compute @PTXOp(%arg60, %arg257) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown52", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, 
memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%39, %36, %arg256) {BlockSize.x = 256 : i32, GridSize.x = 25 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown51", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> + byre.compute @PTXOp(%arg60, %arg257) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown49", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg256, %arg257, %arg258) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %44 = "byre.alias"(%alloc) {offset = 215552 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> - %45 = "byre.alias"(%alloc) {offset = 217600 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %44 = "byre.alias"(%alloc) <{offset = 215552 : i64}> : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %45 = "byre.alias"(%alloc) <{offset = 217600 : i64}> : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg258, %arg57, %arg56, %36, %44, %45) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%36, %arg259) {BlockSize.x = 128 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown54", memory_effects = [1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - byre.compute @PTXOp(%arg61, %arg260) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown55", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%36, %arg259) {BlockSize.x = 256 : i32, GridSize.x = 25 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown48", memory_effects = [1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> + byre.compute @PTXOp(%arg61, %arg260) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown49", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg259, %arg260, %arg261) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %46 = "byre.alias"(%alloc) {offset = 219648 : i64} : (memref<1838592xi8, "cuda">) -> 
memref<512xf32, "cuda"> - %47 = "byre.alias"(%alloc) {offset = 221696 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %46 = "byre.alias"(%alloc) <{offset = 219648 : i64}> : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %47 = "byre.alias"(%alloc) <{offset = 221696 : i64}> : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg261, %arg59, %arg58, %36, %46, %47) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%36, %arg256, %arg262) {BlockSize.x = 128 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown57", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %48 = "byre.alias"(%alloc) {offset = 224768 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x512xf16, "cuda"> - byre.compute @ReduceSumOp_f16_f16(%arg262, %48) {device = "cuda", dimensions = dense<[3, 2]> : tensor<2xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512xf16, "cuda"> - byre.compute @PTXOp(%48, %arg263) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown58", memory_effects = [1 : i32, 2 : i32]} : memref<1x512xf16, "cuda">, memref<1x512xf16, "cuda"> - %49 = "byre.alias"(%alloc) {offset = 224768 : i64} : (memref<1838592xi8, "cuda">) -> memref<1000x512xf16, "cuda"> - byre.compute @PTXOp(%arg4, %49) {BlockSize.x = 128 : i32, GridSize.x = 4000 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown59", memory_effects = [1 : i32, 2 : i32]} : memref<1000x512xf32, "cuda">, memref<1000x512xf16, "cuda"> - byre.compute @TransposeOp_f16_f16(%49, %arg264) {device = "cuda", memory_effects = [1 : i32, 2 : i32], minor_to_major = dense<[0, 1]> : tensor<2xindex>, permutation = dense<[1, 0]> : tensor<2xi64>} : memref<1000x512xf16, "cuda">, memref<512x1000xf16, "cuda"> - %50 = "byre.alias"(%alloc) {offset = 14848 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x1000xf16, "cuda"> - byre.compute @MatmulOp_f16f16_f16(%arg263, %49, %50) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<1x512xf16, "cuda">, memref<1000x512xf16, "cuda">, memref<1x1000xf16, "cuda"> - byre.compute @PTXOp(%arg3, %50, %arg123) {BlockSize.x = 128 : i32, GridSize.x = 8 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown60", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1000xf32, "cuda">, memref<1x1000xf16, "cuda">, memref<1x1000xf16, "cuda"> - byre.compute @PTXOp(%1, %arg63, %arg164) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown61", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%2, %arg64, %arg165) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%4, %arg66, %arg166) {BlockSize.x = 128 : i32, 
GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown63", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%5, %arg67, %arg167) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown64", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%6, %arg69, %arg168) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown65", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%7, %arg70, %arg169) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown66", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%8, %arg72, %arg170) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown67", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%9, %arg73, %arg171) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown68", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%10, %arg75, %arg172) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown69", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%11, %arg76, %arg173) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown70", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%16, %arg78, %arg174) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown71", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%17, %arg79, %arg175) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%18, %arg81, %arg176) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown73", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%19, %arg82, %arg177) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown74", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%13, %arg84, %arg178) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown75", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, 
memref<128xf32, "cuda"> - byre.compute @PTXOp(%14, %arg85, %arg179) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown76", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%20, %arg87, %arg180) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown77", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%21, %arg88, %arg181) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown78", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%22, %arg90, %arg182) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown79", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%23, %arg91, %arg183) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown80", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%28, %arg93, %arg184) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown81", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%29, %arg94, %arg185) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%30, %arg96, %arg186) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown83", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%31, %arg97, %arg187) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown84", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%25, %arg99, %arg188) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown85", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%26, %arg100, %arg189) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown86", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%32, %arg102, %arg190) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown87", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%33, %arg103, %arg191) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], 
kernel_name = "Unknown88", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%34, %arg105, %arg192) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown89", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%35, %arg106, %arg193) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown90", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%40, %arg108, %arg194) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown91", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%41, %arg109, %arg195) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%42, %arg111, %arg196) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown93", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%43, %arg112, %arg197) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown94", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%37, %arg114, %arg198) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown95", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%38, %arg115, %arg199) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown96", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%44, %arg117, %arg200) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown97", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%45, %arg118, %arg201) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown98", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%46, %arg120, %arg202) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown99", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%47, %arg121, %arg203) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown100", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + 
byre.compute @PTXOp(%36, %arg256, %arg262) {BlockSize.x = 256 : i32, GridSize.x = 25 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown51", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> + %48 = "byre.alias"(%arg262) <{offset = 0 : i64}> : (memref<1x512x7x7xf16, "cuda">) -> memref<512x49xf16, "cuda"> + %49 = "byre.alias"(%alloc) <{offset = 224768 : i64}> : (memref<1838592xi8, "cuda">) -> memref<512xf16, "cuda"> + byre.compute @PTXOp(%48, %49) {BlockSize.x = 64 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 512 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown58_kernel"} : memref<512x49xf16, "cuda">, memref<512xf16, "cuda"> + %50 = "byre.alias"(%alloc) <{offset = 224768 : i64}> : (memref<1838592xi8, "cuda">) -> memref<1x512xf16, "cuda"> + byre.compute @PTXOp(%50, %arg263) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown59", memory_effects = [1 : i32, 2 : i32]} : memref<1x512xf16, "cuda">, memref<1x512xf16, "cuda"> + %51 = "byre.alias"(%alloc) <{offset = 224768 : i64}> : (memref<1838592xi8, "cuda">) -> memref<1000x512xf16, "cuda"> + byre.compute @PTXOp(%arg4, %51) {BlockSize.x = 256 : i32, GridSize.x = 500 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown60", memory_effects = [1 : i32, 2 : i32]} : memref<1000x512xf32, "cuda">, memref<1000x512xf16, "cuda"> + byre.compute @TransposeOp_f16_f16(%51, %arg264) {device = "cuda", memory_effects = [1 : i32, 2 : i32], minor_to_major = dense<[0, 1]> : tensor<2xindex>, permutation = dense<[1, 0]> : tensor<2xi64>} : memref<1000x512xf16, "cuda">, memref<512x1000xf16, "cuda"> + %52 = "byre.alias"(%alloc) <{offset = 14848 : i64}> : (memref<1838592xi8, "cuda">) -> memref<1x1000xf16, "cuda"> + byre.compute @MatmulOp_f16f16_f16(%arg263, %51, %52) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<1x512xf16, "cuda">, memref<1000x512xf16, "cuda">, memref<1x1000xf16, "cuda"> + byre.compute @PTXOp(%arg3, %52, %arg123) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown61", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1000xf32, "cuda">, memref<1x1000xf16, "cuda">, memref<1x1000xf16, "cuda"> + byre.compute @PTXOp(%1, %arg63, %arg164) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%2, %arg64, %arg165) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%4, %arg66, %arg166) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%5, %arg67, %arg167) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, 
memref<64xf32, "cuda"> + byre.compute @PTXOp(%6, %arg69, %arg168) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%7, %arg70, %arg169) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%8, %arg72, %arg170) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%9, %arg73, %arg171) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%10, %arg75, %arg172) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%11, %arg76, %arg173) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%16, %arg78, %arg174) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%17, %arg79, %arg175) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%18, %arg81, %arg176) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%19, %arg82, %arg177) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%13, %arg84, %arg178) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%14, %arg85, %arg179) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%20, %arg87, %arg180) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown72", 
memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%21, %arg88, %arg181) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%22, %arg90, %arg182) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%23, %arg91, %arg183) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%28, %arg93, %arg184) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%29, %arg94, %arg185) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%30, %arg96, %arg186) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%31, %arg97, %arg187) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%25, %arg99, %arg188) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%26, %arg100, %arg189) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%32, %arg102, %arg190) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%33, %arg103, %arg191) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%34, %arg105, %arg192) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%35, %arg106, %arg193) 
{BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%40, %arg108, %arg194) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%41, %arg109, %arg195) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%42, %arg111, %arg196) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%43, %arg112, %arg197) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%37, %arg114, %arg198) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%38, %arg115, %arg199) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%44, %arg117, %arg200) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%45, %arg118, %arg201) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%46, %arg120, %arg202) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%47, %arg121, %arg203) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> byre.copy(%arg0, %arg124) {callee = "cuda2cuda"} : memref<64xf32, "cuda">, memref<64xf32, "cuda"> byre.copy(%arg1, %arg125) {callee = "cuda2cuda"} : memref<64xf32, "cuda">, memref<64xf32, "cuda"> byre.copy(%arg5, %arg126) {callee = "cuda2cuda"} : memref<64xf32, "cuda">, memref<64xf32, "cuda"> diff --git a/compiler/test/E2E/ResNet18/FW/9b_nvvm_codegen.mlir b/compiler/test/E2E/ResNet18/FW/9b_nvvm_codegen.mlir index 1ed14037d..5964c86c0 100644 --- 
a/compiler/test/E2E/ResNet18/FW/9b_nvvm_codegen.mlir
+++ b/compiler/test/E2E/ResNet18/FW/9b_nvvm_codegen.mlir
@@ -4,2682 +4,927 @@ module attributes {byre.container_module, gpu.container_module} {
   gpu.module @unified {
-    gpu.func @Unknown100(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel {
-      %cst = arith.constant 0.899999976 : f32
-      %cst_0 = arith.constant 1.000000e-01 : f32
-      %c512 = arith.constant 512 : index
-      %0 = gpu.block_id x
-      %1 = gpu.block_dim x
-      %2 = gpu.thread_id x
-      %3 = arith.muli %1, %0 : index
-      %4 = arith.addi %2, %3 : index
-      %5 = arith.cmpi slt, %4, %c512 : index
-      scf.if %5 {
-        %6 = memref.load %arg0[%4] : memref<512xf32>
-        %7 = memref.load %arg1[%4] : memref<512xf32>
-        %8 = arith.mulf %7, %cst : f32
-        %9 = arith.mulf %6, %cst_0 : f32
-        %10 = arith.addf %9, %8 : f32
-        memref.store %10, %arg2[%4] : memref<512xf32>
-      }
-      gpu.return
-    }
-    gpu.func @Unknown99(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel {
-      %cst = arith.constant 0.899999976 : f32
-      %cst_0 = arith.constant 1.000000e-01 : f32
-      %c512 = arith.constant 512 : index
-      %0 = gpu.block_id x
-      %1 = gpu.block_dim x
-      %2 = gpu.thread_id x
-      %3 = arith.muli %1, %0 : index
-      %4 = arith.addi %2, %3 : index
-      %5 = arith.cmpi slt, %4, %c512 : index
-      scf.if %5 {
-        %6 = memref.load %arg0[%4] : memref<512xf32>
-        %7 = memref.load %arg1[%4] : memref<512xf32>
-        %8 = arith.mulf %7, %cst : f32
-        %9 = arith.mulf %6, %cst_0 : f32
-        %10 = arith.addf %9, %8 : f32
-        memref.store %10, %arg2[%4] : memref<512xf32>
-      }
-      gpu.return
-    }
-    gpu.func @Unknown98(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel {
-      %cst = arith.constant 0.899999976 : f32
-      %cst_0 = arith.constant 1.000000e-01 : f32
-      %c512 = arith.constant 512 : index
-      %0 = gpu.block_id x
-      %1 = gpu.block_dim x
-      %2 = gpu.thread_id x
-      %3 = arith.muli %1, %0 : index
-      %4 = arith.addi %2, %3 : index
-      %5 = arith.cmpi slt, %4, %c512 : index
-      scf.if %5 {
-        %6 = memref.load %arg0[%4] : memref<512xf32>
-        %7 = memref.load %arg1[%4] : memref<512xf32>
-        %8 = arith.mulf %7, %cst : f32
-        %9 = arith.mulf %6, %cst_0 : f32
-        %10 = arith.addf %9, %8 : f32
-        memref.store %10, %arg2[%4] : memref<512xf32>
-      }
-      gpu.return
-    }
-    gpu.func @Unknown97(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel {
-      %cst = arith.constant 0.899999976 : f32
-      %cst_0 = arith.constant 1.000000e-01 : f32
-      %c512 = arith.constant 512 : index
-      %0 = gpu.block_id x
-      %1 = gpu.block_dim x
-      %2 = gpu.thread_id x
-      %3 = arith.muli %1, %0 : index
-      %4 = arith.addi %2, %3 : index
-      %5 = arith.cmpi slt, %4, %c512 : index
-      scf.if %5 {
-        %6 = memref.load %arg0[%4] : memref<512xf32>
-        %7 = memref.load %arg1[%4] : memref<512xf32>
-        %8 = arith.mulf %7, %cst : f32
-        %9 = arith.mulf %6, %cst_0 : f32
-        %10 = arith.addf %9, %8 : f32
-        memref.store %10, %arg2[%4] : memref<512xf32>
-      }
-      gpu.return
-    }
-    gpu.func @Unknown96(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel {
-      %cst = arith.constant 0.899999976 : f32
-      %cst_0 = arith.constant 1.000000e-01 : f32
-      %c512 = arith.constant 512 : index
-      %0 = gpu.block_id x
-      %1 = gpu.block_dim x
-      %2 = gpu.thread_id x
-      %3 = arith.muli %1, %0 : index
-      %4 = arith.addi %2, %3 : index
-      %5 = arith.cmpi slt, %4, %c512 : index
-      scf.if %5 {
-        %6 = memref.load %arg0[%4] : memref<512xf32>
-        %7 = memref.load %arg1[%4] : memref<512xf32>
-        %8 = arith.mulf %7, %cst : f32
-        %9 = arith.mulf %6, %cst_0 : f32
-        %10 = arith.addf %9, %8 : f32
-        memref.store %10, %arg2[%4] : memref<512xf32>
-      }
-      gpu.return
-    }
-    gpu.func @Unknown95(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel {
-      %cst = arith.constant 0.899999976 : f32
-      %cst_0 = arith.constant 1.000000e-01 : f32
-      %c512 = arith.constant 512 : index
-      %0 = gpu.block_id x
-      %1 = gpu.block_dim x
-      %2 = gpu.thread_id x
-      %3 = arith.muli %1, %0 : index
-      %4 = arith.addi %2, %3 : index
-      %5 = arith.cmpi slt, %4, %c512 : index
-      scf.if %5 {
-        %6 = memref.load %arg0[%4] : memref<512xf32>
-        %7 = memref.load %arg1[%4] : memref<512xf32>
-        %8 = arith.mulf %7, %cst : f32
-        %9 = arith.mulf %6, %cst_0 : f32
-        %10 = arith.addf %9, %8 : f32
-        memref.store %10, %arg2[%4] : memref<512xf32>
-      }
-      gpu.return
-    }
-    gpu.func @Unknown94(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel {
-      %cst = arith.constant 0.899999976 : f32
-      %cst_0 = arith.constant 1.000000e-01 : f32
-      %c512 = arith.constant 512 : index
-      %0 = gpu.block_id x
-      %1 = gpu.block_dim x
-      %2 = gpu.thread_id x
-      %3 = arith.muli %1, %0 : index
-      %4 = arith.addi %2, %3 : index
-      %5 = arith.cmpi slt, %4, %c512 : index
-      scf.if %5 {
-        %6 = memref.load %arg0[%4] : memref<512xf32>
-        %7 = memref.load %arg1[%4] : memref<512xf32>
-        %8 = arith.mulf %7, %cst : f32
-        %9 = arith.mulf %6, %cst_0 : f32
-        %10 = arith.addf %9, %8 : f32
-        memref.store %10, %arg2[%4] : memref<512xf32>
-      }
-      gpu.return
-    }
-    gpu.func @Unknown93(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel {
-      %cst = arith.constant 0.899999976 : f32
-      %cst_0 = arith.constant 1.000000e-01 : f32
-      %c512 = arith.constant 512 : index
-      %0 = gpu.block_id x
-      %1 = gpu.block_dim x
-      %2 = gpu.thread_id x
-      %3 = arith.muli %1, %0 : index
-      %4 = arith.addi %2, %3 : index
-      %5 = arith.cmpi slt, %4, %c512 : index
-      scf.if %5 {
-        %6 = memref.load %arg0[%4] : memref<512xf32>
-        %7 = memref.load %arg1[%4] : memref<512xf32>
-        %8 = arith.mulf %7, %cst : f32
-        %9 = arith.mulf %6, %cst_0 : f32
-        %10 = arith.addf %9, %8 : f32
-        memref.store %10, %arg2[%4] : memref<512xf32>
-      }
-      gpu.return
-    }
-    gpu.func @Unknown92(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel {
-      %cst = arith.constant 0.899999976 : f32
-      %cst_0 = arith.constant 1.000000e-01 : f32
-      %c512 = arith.constant 512 : index
-      %0 = gpu.block_id x
-      %1 = gpu.block_dim x
-      %2 = gpu.thread_id x
-      %3 = arith.muli %1, %0 : index
-      %4 = arith.addi %2, %3 : index
-      %5 = arith.cmpi slt, %4, %c512 : index
-      scf.if %5 {
-        %6 = memref.load %arg0[%4] : memref<512xf32>
-        %7 = memref.load %arg1[%4] : memref<512xf32>
-        %8 = arith.mulf %7, %cst : f32
-        %9 = arith.mulf %6, %cst_0 : f32
-        %10 = arith.addf %9, %8 : f32
-        memref.store %10, %arg2[%4] : memref<512xf32>
-      }
-      gpu.return
-    }
-    gpu.func @Unknown91(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel {
-      %cst = arith.constant 0.899999976 : f32
-      %cst_0 = arith.constant 1.000000e-01 : f32
-      %c512 = arith.constant 512 : index
-      %0 = gpu.block_id x
-      %1 = gpu.block_dim x
-      %2 = gpu.thread_id x
-      %3 = arith.muli %1, %0 : index
-      %4 = arith.addi %2, %3 : index
-      %5 = arith.cmpi slt, %4, %c512 : index
-      scf.if %5 {
-        %6 = memref.load %arg0[%4] : memref<512xf32>
-        %7 = memref.load %arg1[%4] : memref<512xf32>
-        %8 = arith.mulf %7, %cst : f32
-        %9 = arith.mulf %6, %cst_0 : f32
-        %10 = arith.addf %9, %8 : f32
-        memref.store %10, %arg2[%4] : memref<512xf32>
-      }
-      gpu.return
-    }
-    gpu.func @Unknown90(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel {
-      %cst = arith.constant 0.899999976 : f32
-      %cst_0 = arith.constant 1.000000e-01 : f32
-      %c256 = arith.constant 256 : index
-      %0 = gpu.block_id x
-      %1 = gpu.block_dim x
-      %2 = gpu.thread_id x
-      %3 = arith.muli %1, %0 : index
-      %4 = arith.addi %2, %3 : index
-      %5 = arith.cmpi slt, %4, %c256 : index
-      scf.if %5 {
-        %6 = memref.load %arg0[%4] : memref<256xf32>
-        %7 = memref.load %arg1[%4] : memref<256xf32>
-        %8 = arith.mulf %7, %cst : f32
-        %9 = arith.mulf %6, %cst_0 : f32
-        %10 = arith.addf %9, %8 : f32
-        memref.store %10, %arg2[%4] : memref<256xf32>
-      }
-      gpu.return
-    }
-    gpu.func @Unknown89(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel {
-      %cst = arith.constant 0.899999976 : f32
-      %cst_0 = arith.constant 1.000000e-01 : f32
-      %c256 = arith.constant 256 : index
-      %0 = gpu.block_id x
-      %1 = gpu.block_dim x
-      %2 = gpu.thread_id x
-      %3 = arith.muli %1, %0 : index
-      %4 = arith.addi %2, %3 : index
-      %5 = arith.cmpi slt, %4, %c256 : index
-      scf.if %5 {
-        %6 = memref.load %arg0[%4] : memref<256xf32>
-        %7 = memref.load %arg1[%4] : memref<256xf32>
-        %8 = arith.mulf %7, %cst : f32
-        %9 = arith.mulf %6, %cst_0 : f32
-        %10 = arith.addf %9, %8 : f32
-        memref.store %10, %arg2[%4] : memref<256xf32>
-      }
-      gpu.return
-    }
-    gpu.func @Unknown88(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel {
-      %cst = arith.constant 0.899999976 : f32
-      %cst_0 = arith.constant 1.000000e-01 : f32
-      %c256 = arith.constant 256 : index
-      %0 = gpu.block_id x
-      %1 = gpu.block_dim x
-      %2 = gpu.thread_id x
-      %3 = arith.muli %1, %0 : index
-      %4 = arith.addi %2, %3 : index
-      %5 = arith.cmpi slt, %4, %c256 : index
-      scf.if %5 {
-        %6 = memref.load %arg0[%4] : memref<256xf32>
-        %7 = memref.load %arg1[%4] : memref<256xf32>
-        %8 = arith.mulf %7, %cst : f32
-        %9 = arith.mulf %6, %cst_0 : f32
-        %10 = arith.addf %9, %8 : f32
-        memref.store %10, %arg2[%4] : memref<256xf32>
-      }
-      gpu.return
-    }
-    gpu.func @Unknown87(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel {
-      %cst = arith.constant 0.899999976 : f32
-      %cst_0 = arith.constant 1.000000e-01 : f32
-      %c256 = arith.constant 256 : index
-      %0 = gpu.block_id x
-      %1 = gpu.block_dim x
-      %2 = gpu.thread_id x
-      %3 = arith.muli %1, %0 : index
-      %4 = arith.addi %2, %3 : index
-      %5 = arith.cmpi slt, %4, %c256 : index
-      scf.if %5 {
-        %6 = memref.load %arg0[%4] : memref<256xf32>
-        %7 = memref.load %arg1[%4] : memref<256xf32>
-        %8 = arith.mulf %7, %cst : f32
-        %9 = arith.mulf %6, %cst_0 : f32
-        %10 = arith.addf %9, %8 : f32
-        memref.store %10, %arg2[%4] : memref<256xf32>
-      }
-      gpu.return
-    }
-    gpu.func @Unknown86(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel {
-      %cst = arith.constant 0.899999976 : f32
-      %cst_0 = arith.constant 1.000000e-01 : f32
-      %c256 = arith.constant 256 : index
-      %0 = gpu.block_id x
-      %1 = gpu.block_dim x
-      %2 = gpu.thread_id x
-      %3 = arith.muli %1, %0 : index
-      %4 = arith.addi %2, %3 : index
-      %5 = arith.cmpi slt, %4, %c256 : index
-      scf.if %5 {
-        %6 = memref.load %arg0[%4] : memref<256xf32>
-        %7 = memref.load %arg1[%4] : memref<256xf32>
-        %8 = arith.mulf %7, %cst : f32
-        %9 = arith.mulf %6, %cst_0 : f32
-        %10 = arith.addf %9, %8 : f32
-        memref.store %10, %arg2[%4] : memref<256xf32>
-      }
-      gpu.return
-    }
-    gpu.func @Unknown85(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel {
-      %cst = arith.constant 0.899999976 : f32
-      %cst_0 = arith.constant 1.000000e-01 : f32
-      %c256 = arith.constant 256 : index
-      %0 = 
gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown84(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown83(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown82(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown81(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c256 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<256xf32> - %7 = memref.load %arg1[%4] : memref<256xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<256xf32> - } - gpu.return - } - gpu.func @Unknown80(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 
: index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown79(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown78(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown77(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown76(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown75(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = 
arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown74(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown73(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown72(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown71(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c128 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<128xf32> - %7 = memref.load %arg1[%4] : memref<128xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<128xf32> - } - gpu.return - } - gpu.func @Unknown70(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown69(%arg0: 
memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown68(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown67(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown66(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown65(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown64(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = 
gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown63(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown62(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> - } - gpu.return - } - gpu.func @Unknown61(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { - %cst = arith.constant 0.899999976 : f32 - %cst_0 = arith.constant 1.000000e-01 : f32 - %c64 = arith.constant 64 : index + gpu.func @Unknown92(%arg0: memref<512xf32>, %arg1: memref<512xf32>, %arg2: memref<512xf32>) kernel { + %cst = arith.constant 1.000000e-01 : f32 + %cst_0 = arith.constant 0.899999976 : f32 + %c512 = arith.constant 512 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c64 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<64xf32> - %7 = memref.load %arg1[%4] : memref<64xf32> - %8 = arith.mulf %7, %cst : f32 - %9 = arith.mulf %6, %cst_0 : f32 - %10 = arith.addf %9, %8 : f32 - memref.store %10, %arg2[%4] : memref<64xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c512 step %6 { + %7 = memref.load %arg1[%arg3] : memref<512xf32> + %8 = memref.load %arg0[%arg3] : memref<512xf32> + %9 = arith.mulf %7, %cst_0 : f32 + %10 = arith.mulf %8, %cst : f32 + %11 = arith.addf %10, %9 : f32 + memref.store %11, %arg2[%arg3] : memref<512xf32> } gpu.return } - gpu.func @Unknown60(%arg0: memref<1000xf32>, %arg1: memref<1x1000xf16>, %arg2: memref<1x1000xf16>) kernel { - %c0 = arith.constant 0 : index - %c1000 = arith.constant 1000 : index + gpu.func @Unknown82(%arg0: memref<256xf32>, %arg1: memref<256xf32>, %arg2: memref<256xf32>) kernel { + %cst = arith.constant 1.000000e-01 : f32 + %cst_0 = arith.constant 0.899999976 : f32 + %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = 
arith.cmpi slt, %4, %c1000 : index - scf.if %5 { - %6 = memref.load %arg1[%c0, %4] : memref<1x1000xf16> - %7 = memref.load %arg0[%4] : memref<1000xf32> - %8 = arith.truncf %7 : f32 to f16 - %9 = arith.addf %6, %8 : f16 - memref.store %9, %arg2[%c0, %4] : memref<1x1000xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c256 step %6 { + %7 = memref.load %arg1[%arg3] : memref<256xf32> + %8 = memref.load %arg0[%arg3] : memref<256xf32> + %9 = arith.mulf %7, %cst_0 : f32 + %10 = arith.mulf %8, %cst : f32 + %11 = arith.addf %10, %9 : f32 + memref.store %11, %arg2[%arg3] : memref<256xf32> } gpu.return } - gpu.func @Unknown59(%arg0: memref<1000x512xf32>, %arg1: memref<1000x512xf16>) kernel { - %c0 = arith.constant 0 : index - %c512000 = arith.constant 512000 : index - %c512 = arith.constant 512 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown72(%arg0: memref<128xf32>, %arg1: memref<128xf32>, %arg2: memref<128xf32>) kernel { + %cst = arith.constant 1.000000e-01 : f32 + %cst_0 = arith.constant 0.899999976 : f32 + %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512000 : index - scf.if %5 { - %6 = arith.remsi %4, %c512 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c512 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c512 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9] : memref<1000x512xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9] : memref<1000x512xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c128 step %6 { + %7 = memref.load %arg1[%arg3] : memref<128xf32> + %8 = memref.load %arg0[%arg3] : memref<128xf32> + %9 = arith.mulf %7, %cst_0 : f32 + %10 = arith.mulf %8, %cst : f32 + %11 = arith.addf %10, %9 : f32 + memref.store %11, %arg2[%arg3] : memref<128xf32> } gpu.return } - gpu.func @Unknown58(%arg0: memref<1x512xf16>, %arg1: memref<1x512xf16>) kernel { - %cst = arith.constant 2.040100e-02 : f16 - %c0 = arith.constant 0 : index - %c512 = arith.constant 512 : index + gpu.func @Unknown62(%arg0: memref<64xf32>, %arg1: memref<64xf32>, %arg2: memref<64xf32>) kernel { + %cst = arith.constant 1.000000e-01 : f32 + %cst_0 = arith.constant 0.899999976 : f32 + %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512 : index - scf.if %5 { - %6 = memref.load %arg0[%c0, %4] : memref<1x512xf16> - %7 = arith.mulf %6, %cst : f16 - memref.store %7, %arg1[%c0, %4] : memref<1x512xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c64 step %6 { + %7 = memref.load %arg1[%arg3] : memref<64xf32> + %8 = memref.load %arg0[%arg3] : memref<64xf32> + %9 = arith.mulf %7, %cst_0 : f32 + %10 = arith.mulf %8, %cst : f32 + %11 = arith.addf %10, %9 : f32 + memref.store %11, %arg2[%arg3] : memref<64xf32> } gpu.return } - gpu.func @Unknown57(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 + gpu.func @Unknown61(%arg0: memref<1000xf32>, %arg1: memref<1x1000xf16>, %arg2: memref<1x1000xf16>) 
kernel { %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index + %c1000 = arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c1000 step %6 { + %7 = memref.load %arg0[%arg3] : memref<1000xf32> + %8 = memref.load %arg1[%c0, %arg3] : memref<1x1000xf16> + %9 = arith.truncf %7 : f32 to f16 + %10 = arith.addf %8, %9 : f16 + memref.store %10, %arg2[%c0, %arg3] : memref<1x1000xf16> } gpu.return } - gpu.func @Unknown55(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown60(%arg0: memref<1000x512xf32>, %arg1: memref<1000x512xf16>) kernel { + %c512000 = arith.constant 512000 : index %c512 = arith.constant 512 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = 
memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf16> - } - gpu.return - } - gpu.func @Unknown54(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c512000 step %6 { + %7 = arith.remsi %arg2, %c512 : index + %8 = arith.divsi %arg2, %c512 : index + %9 = memref.load %arg0[%8, %7] : memref<1000x512xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7] : memref<1000x512xf16> } gpu.return } - gpu.func @Unknown52(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { + gpu.func @Unknown59(%arg0: memref<1x512xf16>, %arg1: memref<1x512xf16>) kernel { + %cst = arith.constant 2.040100e-02 : f16 %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c512 = arith.constant 512 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index 
- %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c512 step %6 { + %7 = memref.load %arg0[%c0, %arg2] : memref<1x512xf16> + %8 = arith.mulf %7, %cst : f16 + memref.store %8, %arg1[%c0, %arg2] : memref<1x512xf16> } gpu.return } gpu.func @Unknown51(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>, %arg2: memref<1x512x7x7xf16>) kernel { + %c25088 = arith.constant 25088 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c25088 step %6 { + %7 = arith.remsi %arg3, %c7 : index + %8 = arith.divsi %arg3, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %13 = arith.addf %11, %12 : f16 + %14 = arith.maximumf %13, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x512x7x7xf16> } gpu.return } gpu.func @Unknown49(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c512 = arith.constant 512 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = 
arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c2359296 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x512x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x512x3x3xf16> } gpu.return } gpu.func @Unknown48(%arg0: memref<1x512x7x7xf16>, %arg1: memref<1x512x7x7xf16>) kernel { + %c25088 = arith.constant 25088 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c25088 = arith.constant 25088 : index %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c25088 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x512x7x7xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c25088 step %6 { + %7 = arith.remsi %arg2, %c7 : index + %8 = arith.divsi %arg2, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x512x7x7xf16> + %12 = arith.maximumf %11, %cst : f16 + memref.store %12, %arg1[%c0, %10, %9, %7] : memref<1x512x7x7xf16> } gpu.return } gpu.func @Unknown46(%arg0: memref<512x256x3x3xf32>, %arg1: memref<512x256x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c1179648 = 
arith.constant 1179648 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1179648 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x256x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1179648 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x256x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x256x3x3xf16> } gpu.return } gpu.func @Unknown44(%arg0: memref<512x256x1x1xf32>, %arg1: memref<512x256x1x1xf16>) kernel { - %c0 = arith.constant 0 : index %c131072 = arith.constant 131072 : index - %c256 = arith.constant 256 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c131072 : index - scf.if %5 { - %6 = arith.remsi %4, %c256 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c256 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c256 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<512x256x1x1xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<512x256x1x1xf16> - } - gpu.return - } - gpu.func @Unknown43(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : 
index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - } - gpu.return - } - gpu.func @Unknown41(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf16> - } - gpu.return - } - gpu.func @Unknown40(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 
= arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - } - gpu.return - } - gpu.func @Unknown38(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c131072 step %6 { + %7 = arith.remsi %arg2, %c256 : index + %8 = arith.divsi %arg2, %c256 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<512x256x1x1xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<512x256x1x1xf16> } gpu.return } gpu.func @Unknown37(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>, %arg2: memref<1x256x14x14xf16>) kernel { + %c50176 = arith.constant 50176 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index %c14 = arith.constant 14 : index - %c-1 = arith.constant 
-1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c50176 step %6 { + %7 = arith.remsi %arg3, %c14 : index + %8 = arith.divsi %arg3, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %13 = arith.addf %11, %12 : f16 + %14 = arith.maximumf %13, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x256x14x14xf16> } gpu.return } gpu.func @Unknown35(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf32> - %37 = 
arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c589824 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x256x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x256x3x3xf16> } gpu.return } gpu.func @Unknown34(%arg0: memref<1x256x14x14xf16>, %arg1: memref<1x256x14x14xf16>) kernel { + %c50176 = arith.constant 50176 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c50176 = arith.constant 50176 : index %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c50176 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x256x14x14xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c50176 step %6 { + %7 = arith.remsi %arg2, %c14 : index + %8 = arith.divsi %arg2, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x256x14x14xf16> + %12 = arith.maximumf %11, %cst : f16 + memref.store %12, %arg1[%c0, %10, %9, %7] : memref<1x256x14x14xf16> } gpu.return } gpu.func @Unknown32(%arg0: memref<256x128x3x3xf32>, %arg1: memref<256x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c294912 = arith.constant 294912 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c294912 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = 
arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x128x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c294912 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x128x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x128x3x3xf16> } gpu.return } gpu.func @Unknown30(%arg0: memref<256x128x1x1xf32>, %arg1: memref<256x128x1x1xf16>) kernel { - %c0 = arith.constant 0 : index %c32768 = arith.constant 32768 : index - %c128 = arith.constant 128 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c32768 : index - scf.if %5 { - %6 = arith.remsi %4, %c128 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c128 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c128 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<256x128x1x1xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<256x128x1x1xf16> - } - gpu.return - } - gpu.func @Unknown29(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index 
- %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - } - gpu.return - } - gpu.func @Unknown27(%arg0: memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf16> - } - gpu.return - } - gpu.func @Unknown26(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : 
memref<1x128x28x28xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - } - gpu.return - } - gpu.func @Unknown24(%arg0: memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c32768 step %6 { + %7 = arith.remsi %arg2, %c128 : index + %8 = arith.divsi %arg2, %c128 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<256x128x1x1xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<256x128x1x1xf16> } gpu.return } gpu.func @Unknown23(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>, %arg2: memref<1x128x28x28xf16>) kernel { + %c100352 = arith.constant 100352 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi 
%22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg3, %c28 : index + %8 = arith.divsi %arg3, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %13 = arith.addf %11, %12 : f16 + %14 = arith.maximumf %13, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x128x28x28xf16> } gpu.return } gpu.func @Unknown21(%arg0: memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c147456 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x128x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x128x3x3xf16> } gpu.return } gpu.func @Unknown20(%arg0: memref<1x128x28x28xf16>, %arg1: memref<1x128x28x28xf16>) kernel { + %c100352 = arith.constant 100352 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index %c28 = 
arith.constant 28 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x128x28x28xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg2, %c28 : index + %8 = arith.divsi %arg2, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x128x28x28xf16> + %12 = arith.maximumf %11, %cst : f16 + memref.store %12, %arg1[%c0, %10, %9, %7] : memref<1x128x28x28xf16> } gpu.return } gpu.func @Unknown18(%arg0: memref<128x64x3x3xf32>, %arg1: memref<128x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c73728 = arith.constant 73728 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c73728 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x64x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = 
%4 to %c73728 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x64x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x64x3x3xf16> } gpu.return } gpu.func @Unknown16(%arg0: memref<128x64x1x1xf32>, %arg1: memref<128x64x1x1xf16>) kernel { - %c0 = arith.constant 0 : index %c8192 = arith.constant 8192 : index - %c64 = arith.constant 64 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c8192 : index - scf.if %5 { - %6 = arith.remsi %4, %c64 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c64 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c64 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<128x64x1x1xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<128x64x1x1xf16> - } - gpu.return - } - gpu.func @Unknown15(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - } - gpu.return - } - gpu.func @Unknown13(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, 
%c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> - } - gpu.return - } - gpu.func @Unknown12(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - } - gpu.return - } - gpu.func @Unknown10(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - 
%15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c8192 step %6 { + %7 = arith.remsi %arg2, %c64 : index + %8 = arith.divsi %arg2, %c64 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<128x64x1x1xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<128x64x1x1xf16> } gpu.return } gpu.func @Unknown9(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>, %arg2: memref<1x64x56x56xf16>) kernel { + %c200704 = arith.constant 200704 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = memref.load %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %28 = arith.addf %26, %27 : f16 - %29 = arith.maxnumf %28, %cst : f16 - memref.store %29, %arg2[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - } - gpu.return - } - gpu.func @Unknown7(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index 
- %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg3, %c56 : index + %8 = arith.divsi %arg3, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %12 = memref.load %arg1[%c0, %10, %9, %7] : memref<1x64x56x56xf16> + %13 = arith.addf %11, %12 : f16 + %14 = arith.maximumf %13, %cst : f16 + memref.store %14, %arg2[%c0, %10, %9, %7] : memref<1x64x56x56xf16> } gpu.return } gpu.func @Unknown6(%arg0: memref<1x64x56x56xf16>, %arg1: memref<1x64x56x56xf16>) kernel { + %c200704 = arith.constant 200704 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x56x56xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg2, %c56 : index + %8 = arith.divsi %arg2, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : 
memref<1x64x56x56xf16> + %12 = arith.maximumf %11, %cst : f16 + memref.store %12, %arg1[%c0, %10, %9, %7] : memref<1x64x56x56xf16> } gpu.return } gpu.func @Unknown4(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c36864 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x64x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x64x3x3xf16> } gpu.return } gpu.func @Unknown3(%arg0: memref<1x64x112x112xf16>, %arg1: memref<1x64x112x112xf16>) kernel { + %c802816 = arith.constant 802816 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index %c112 = arith.constant 112 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c112 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c112 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c112 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c112 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c112 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi 
slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c112 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x64x112x112xf16> - %27 = arith.maxnumf %26, %cst : f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x64x112x112xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg2, %c112 : index + %8 = arith.divsi %arg2, %c112 : index + %9 = arith.remsi %8, %c112 : index + %10 = arith.divsi %8, %c112 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x64x112x112xf16> + %12 = arith.maximumf %11, %cst : f16 + memref.store %12, %arg1[%c0, %10, %9, %7] : memref<1x64x112x112xf16> } gpu.return } gpu.func @Unknown1(%arg0: memref<64x3x7x7xf32>, %arg1: memref<64x3x7x7xf16>) kernel { - %c0 = arith.constant 0 : index %c9408 = arith.constant 9408 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %c3 = arith.constant 3 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c9408 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c3 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c3 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c3 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x3x7x7xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x3x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c9408 step %6 { + %7 = arith.remsi %arg2, %c7 : index + %8 = arith.divsi %arg2, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c3 : index + %12 = arith.divsi %10, %c3 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x3x7x7xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x3x7x7xf16> } gpu.return } gpu.func @Unknown0(%arg0: memref<1x3x224x224xf32>, %arg1: memref<1x3x224x224xf16>) kernel { - %c0 = arith.constant 0 : index %c150528 = arith.constant 150528 : index + %c0 = arith.constant 0 : index %c224 = arith.constant 224 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 
: index - %5 = arith.cmpi slt, %4, %c150528 : index - scf.if %5 { - %6 = arith.remsi %4, %c224 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c224 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c224 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c224 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c224 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c224 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = memref.load %arg0[%c0, %25, %19, %9] : memref<1x3x224x224xf32> - %27 = arith.truncf %26 : f32 to f16 - memref.store %27, %arg1[%c0, %25, %19, %9] : memref<1x3x224x224xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c150528 step %6 { + %7 = arith.remsi %arg2, %c224 : index + %8 = arith.divsi %arg2, %c224 : index + %9 = arith.remsi %8, %c224 : index + %10 = arith.divsi %8, %c224 : index + %11 = memref.load %arg0[%c0, %10, %9, %7] : memref<1x3x224x224xf32> + %12 = arith.truncf %11 : f32 to f16 + memref.store %12, %arg1[%c0, %10, %9, %7] : memref<1x3x224x224xf16> } gpu.return } + gpu.func @Unknown58_kernel(%arg0: memref<512x49xf16>, %arg1: memref<512xf16>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c2 = arith.constant 2 : index + %c32 = arith.constant 32 : index + %cst = arith.constant 0.000000e+00 : f16 + %c1 = arith.constant 1 : index + %c49 = arith.constant 49 : index + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %0 = gpu.block_id x + %subview = memref.subview %arg0[%0, 0] [1, 49] [1, 1] : memref<512x49xf16> to memref<49xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<49xf16, strided<[1], offset: ?>> into memref<1x49xf16, strided<[49, 1], offset: ?>> + %alloca = memref.alloca() : memref<64xf16, #gpu.address_space> + %1 = gpu.thread_id x + %2 = arith.remsi %1, %c64 : index + %3 = arith.cmpi slt, %2, %c0 : index + %4 = arith.addi %2, %c64 : index + %5 = arith.select %3, %4, %2 : index + %6 = arith.cmpi slt, %5, %c49 : index + %7 = arith.select %6, %5, %c49 : index + %8 = arith.addi %5, %c1 : index + %9 = arith.cmpi slt, %8, %c49 : index + %10 = arith.select %9, %8, %c49 : index + %11 = arith.subi %10, %7 : index + %subview_0 = memref.subview %expand_shape[0, %7] [1, %11] [1, 1] : memref<1x49xf16, strided<[49, 1], offset: ?>> to memref> + %expand_shape_1 = memref.expand_shape %subview_0 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %12 = arith.cmpi ugt, %11, %c0 : index + %13 = scf.if %12 -> (f16) { + %21 = memref.load %expand_shape_1[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %21 : f16 + } else { + scf.yield %cst : f16 + } + %14 = arith.addf %13, %cst : f16 + memref.store %14, %alloca[%1] : memref<64xf16, #gpu.address_space> + gpu.barrier + %alloca_2 = memref.alloca() : memref<32xf16, #gpu.address_space> + %15 = arith.cmpi ult, %1, %c32 : index + scf.if %15 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca[%21] : memref<64xf16, 
#gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca[%24] : memref<64xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_2[%1] : memref<32xf16, #gpu.address_space> + } + gpu.barrier + %alloca_3 = memref.alloca() : memref<16xf16, #gpu.address_space> + %16 = arith.cmpi ult, %1, %c16 : index + scf.if %16 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_2[%21] : memref<32xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_2[%24] : memref<32xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_3[%1] : memref<16xf16, #gpu.address_space> + } + gpu.barrier + %alloca_4 = memref.alloca() : memref<8xf16, #gpu.address_space> + %17 = arith.cmpi ult, %1, %c8 : index + scf.if %17 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_3[%21] : memref<16xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_3[%24] : memref<16xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_4[%1] : memref<8xf16, #gpu.address_space> + } + gpu.barrier + %alloca_5 = memref.alloca() : memref<4xf16, #gpu.address_space> + %18 = arith.cmpi ult, %1, %c4 : index + scf.if %18 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_4[%21] : memref<8xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_4[%24] : memref<8xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_5[%1] : memref<4xf16, #gpu.address_space> + } + gpu.barrier + %alloca_6 = memref.alloca() : memref<2xf16, #gpu.address_space> + %19 = arith.cmpi ult, %1, %c2 : index + scf.if %19 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_5[%21] : memref<4xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_5[%24] : memref<4xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_6[%1] : memref<2xf16, #gpu.address_space> + } + gpu.barrier + %20 = arith.cmpi ult, %1, %c1 : index + scf.if %20 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_6[%21] : memref<2xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_6[%24] : memref<2xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %arg1[%0] : memref<512xf16> + } + gpu.barrier + gpu.return + } } func.func @main(%arg0: memref<64xf32, "cuda"> {byre.argname = "Input0", byre.argtype = 1 : i32}, %arg1: memref<64xf32, "cuda"> {byre.argname = "Input1", byre.argtype = 1 : i32}, %arg2: memref<64x3x7x7xf32, "cuda"> {byre.argname = "Input2", byre.argtype = 1 : i32}, %arg3: memref<1000xf32, "cuda"> {byre.argname = "Input3", byre.argtype = 1 : i32}, %arg4: memref<1000x512xf32, "cuda"> {byre.argname = "Input4", byre.argtype = 1 : i32}, %arg5: memref<64xf32, "cuda"> {byre.argname = "Input5", byre.argtype = 1 : i32}, %arg6: memref<64xf32, "cuda"> {byre.argname = "Input6", byre.argtype = 1 : i32}, %arg7: memref<64xf32, "cuda"> {byre.argname = "Input7", byre.argtype = 1 : i32}, %arg8: memref<64xf32, "cuda"> {byre.argname = "Input8", byre.argtype = 1 : i32}, %arg9: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Input9", byre.argtype = 1 : i32}, %arg10: memref<64x64x3x3xf32, 
"cuda"> {byre.argname = "Input10", byre.argtype = 1 : i32}, %arg11: memref<64xf32, "cuda"> {byre.argname = "Input11", byre.argtype = 1 : i32}, %arg12: memref<64xf32, "cuda"> {byre.argname = "Input12", byre.argtype = 1 : i32}, %arg13: memref<64xf32, "cuda"> {byre.argname = "Input13", byre.argtype = 1 : i32}, %arg14: memref<64xf32, "cuda"> {byre.argname = "Input14", byre.argtype = 1 : i32}, %arg15: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Input15", byre.argtype = 1 : i32}, %arg16: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Input16", byre.argtype = 1 : i32}, %arg17: memref<128xf32, "cuda"> {byre.argname = "Input17", byre.argtype = 1 : i32}, %arg18: memref<128xf32, "cuda"> {byre.argname = "Input18", byre.argtype = 1 : i32}, %arg19: memref<128xf32, "cuda"> {byre.argname = "Input19", byre.argtype = 1 : i32}, %arg20: memref<128xf32, "cuda"> {byre.argname = "Input20", byre.argtype = 1 : i32}, %arg21: memref<128x64x3x3xf32, "cuda"> {byre.argname = "Input21", byre.argtype = 1 : i32}, %arg22: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Input22", byre.argtype = 1 : i32}, %arg23: memref<128x64x1x1xf32, "cuda"> {byre.argname = "Input23", byre.argtype = 1 : i32}, %arg24: memref<128xf32, "cuda"> {byre.argname = "Input24", byre.argtype = 1 : i32}, %arg25: memref<128xf32, "cuda"> {byre.argname = "Input25", byre.argtype = 1 : i32}, %arg26: memref<128xf32, "cuda"> {byre.argname = "Input26", byre.argtype = 1 : i32}, %arg27: memref<128xf32, "cuda"> {byre.argname = "Input27", byre.argtype = 1 : i32}, %arg28: memref<128xf32, "cuda"> {byre.argname = "Input28", byre.argtype = 1 : i32}, %arg29: memref<128xf32, "cuda"> {byre.argname = "Input29", byre.argtype = 1 : i32}, %arg30: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Input30", byre.argtype = 1 : i32}, %arg31: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Input31", byre.argtype = 1 : i32}, %arg32: memref<256xf32, "cuda"> {byre.argname = "Input32", byre.argtype = 1 : i32}, %arg33: memref<256xf32, "cuda"> {byre.argname = "Input33", byre.argtype = 1 : i32}, %arg34: memref<256xf32, "cuda"> {byre.argname = "Input34", byre.argtype = 1 : i32}, %arg35: memref<256xf32, "cuda"> {byre.argname = "Input35", byre.argtype = 1 : i32}, %arg36: memref<256x128x3x3xf32, "cuda"> {byre.argname = "Input36", byre.argtype = 1 : i32}, %arg37: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Input37", byre.argtype = 1 : i32}, %arg38: memref<256x128x1x1xf32, "cuda"> {byre.argname = "Input38", byre.argtype = 1 : i32}, %arg39: memref<256xf32, "cuda"> {byre.argname = "Input39", byre.argtype = 1 : i32}, %arg40: memref<256xf32, "cuda"> {byre.argname = "Input40", byre.argtype = 1 : i32}, %arg41: memref<256xf32, "cuda"> {byre.argname = "Input41", byre.argtype = 1 : i32}, %arg42: memref<256xf32, "cuda"> {byre.argname = "Input42", byre.argtype = 1 : i32}, %arg43: memref<256xf32, "cuda"> {byre.argname = "Input43", byre.argtype = 1 : i32}, %arg44: memref<256xf32, "cuda"> {byre.argname = "Input44", byre.argtype = 1 : i32}, %arg45: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Input45", byre.argtype = 1 : i32}, %arg46: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Input46", byre.argtype = 1 : i32}, %arg47: memref<512xf32, "cuda"> {byre.argname = "Input47", byre.argtype = 1 : i32}, %arg48: memref<512xf32, "cuda"> {byre.argname = "Input48", byre.argtype = 1 : i32}, %arg49: memref<512xf32, "cuda"> {byre.argname = "Input49", byre.argtype = 1 : i32}, %arg50: memref<512xf32, "cuda"> {byre.argname = "Input50", byre.argtype = 1 : i32}, %arg51: memref<512x256x3x3xf32, "cuda"> 
{byre.argname = "Input51", byre.argtype = 1 : i32}, %arg52: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Input52", byre.argtype = 1 : i32}, %arg53: memref<512x256x1x1xf32, "cuda"> {byre.argname = "Input53", byre.argtype = 1 : i32}, %arg54: memref<512xf32, "cuda"> {byre.argname = "Input54", byre.argtype = 1 : i32}, %arg55: memref<512xf32, "cuda"> {byre.argname = "Input55", byre.argtype = 1 : i32}, %arg56: memref<512xf32, "cuda"> {byre.argname = "Input56", byre.argtype = 1 : i32}, %arg57: memref<512xf32, "cuda"> {byre.argname = "Input57", byre.argtype = 1 : i32}, %arg58: memref<512xf32, "cuda"> {byre.argname = "Input58", byre.argtype = 1 : i32}, %arg59: memref<512xf32, "cuda"> {byre.argname = "Input59", byre.argtype = 1 : i32}, %arg60: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Input60", byre.argtype = 1 : i32}, %arg61: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Input61", byre.argtype = 1 : i32}, %arg62: memref {byre.argname = "Input62", byre.argtype = 1 : i32}, %arg63: memref<64xf32, "cuda"> {byre.argname = "Input63", byre.argtype = 1 : i32}, %arg64: memref<64xf32, "cuda"> {byre.argname = "Input64", byre.argtype = 1 : i32}, %arg65: memref {byre.argname = "Input65", byre.argtype = 1 : i32}, %arg66: memref<64xf32, "cuda"> {byre.argname = "Input66", byre.argtype = 1 : i32}, %arg67: memref<64xf32, "cuda"> {byre.argname = "Input67", byre.argtype = 1 : i32}, %arg68: memref {byre.argname = "Input68", byre.argtype = 1 : i32}, %arg69: memref<64xf32, "cuda"> {byre.argname = "Input69", byre.argtype = 1 : i32}, %arg70: memref<64xf32, "cuda"> {byre.argname = "Input70", byre.argtype = 1 : i32}, %arg71: memref {byre.argname = "Input71", byre.argtype = 1 : i32}, %arg72: memref<64xf32, "cuda"> {byre.argname = "Input72", byre.argtype = 1 : i32}, %arg73: memref<64xf32, "cuda"> {byre.argname = "Input73", byre.argtype = 1 : i32}, %arg74: memref {byre.argname = "Input74", byre.argtype = 1 : i32}, %arg75: memref<64xf32, "cuda"> {byre.argname = "Input75", byre.argtype = 1 : i32}, %arg76: memref<64xf32, "cuda"> {byre.argname = "Input76", byre.argtype = 1 : i32}, %arg77: memref {byre.argname = "Input77", byre.argtype = 1 : i32}, %arg78: memref<128xf32, "cuda"> {byre.argname = "Input78", byre.argtype = 1 : i32}, %arg79: memref<128xf32, "cuda"> {byre.argname = "Input79", byre.argtype = 1 : i32}, %arg80: memref {byre.argname = "Input80", byre.argtype = 1 : i32}, %arg81: memref<128xf32, "cuda"> {byre.argname = "Input81", byre.argtype = 1 : i32}, %arg82: memref<128xf32, "cuda"> {byre.argname = "Input82", byre.argtype = 1 : i32}, %arg83: memref {byre.argname = "Input83", byre.argtype = 1 : i32}, %arg84: memref<128xf32, "cuda"> {byre.argname = "Input84", byre.argtype = 1 : i32}, %arg85: memref<128xf32, "cuda"> {byre.argname = "Input85", byre.argtype = 1 : i32}, %arg86: memref {byre.argname = "Input86", byre.argtype = 1 : i32}, %arg87: memref<128xf32, "cuda"> {byre.argname = "Input87", byre.argtype = 1 : i32}, %arg88: memref<128xf32, "cuda"> {byre.argname = "Input88", byre.argtype = 1 : i32}, %arg89: memref {byre.argname = "Input89", byre.argtype = 1 : i32}, %arg90: memref<128xf32, "cuda"> {byre.argname = "Input90", byre.argtype = 1 : i32}, %arg91: memref<128xf32, "cuda"> {byre.argname = "Input91", byre.argtype = 1 : i32}, %arg92: memref {byre.argname = "Input92", byre.argtype = 1 : i32}, %arg93: memref<256xf32, "cuda"> {byre.argname = "Input93", byre.argtype = 1 : i32}, %arg94: memref<256xf32, "cuda"> {byre.argname = "Input94", byre.argtype = 1 : i32}, %arg95: memref {byre.argname = "Input95", byre.argtype 
= 1 : i32}, %arg96: memref<256xf32, "cuda"> {byre.argname = "Input96", byre.argtype = 1 : i32}, %arg97: memref<256xf32, "cuda"> {byre.argname = "Input97", byre.argtype = 1 : i32}, %arg98: memref {byre.argname = "Input98", byre.argtype = 1 : i32}, %arg99: memref<256xf32, "cuda"> {byre.argname = "Input99", byre.argtype = 1 : i32}, %arg100: memref<256xf32, "cuda"> {byre.argname = "Input100", byre.argtype = 1 : i32}, %arg101: memref {byre.argname = "Input101", byre.argtype = 1 : i32}, %arg102: memref<256xf32, "cuda"> {byre.argname = "Input102", byre.argtype = 1 : i32}, %arg103: memref<256xf32, "cuda"> {byre.argname = "Input103", byre.argtype = 1 : i32}, %arg104: memref {byre.argname = "Input104", byre.argtype = 1 : i32}, %arg105: memref<256xf32, "cuda"> {byre.argname = "Input105", byre.argtype = 1 : i32}, %arg106: memref<256xf32, "cuda"> {byre.argname = "Input106", byre.argtype = 1 : i32}, %arg107: memref {byre.argname = "Input107", byre.argtype = 1 : i32}, %arg108: memref<512xf32, "cuda"> {byre.argname = "Input108", byre.argtype = 1 : i32}, %arg109: memref<512xf32, "cuda"> {byre.argname = "Input109", byre.argtype = 1 : i32}, %arg110: memref {byre.argname = "Input110", byre.argtype = 1 : i32}, %arg111: memref<512xf32, "cuda"> {byre.argname = "Input111", byre.argtype = 1 : i32}, %arg112: memref<512xf32, "cuda"> {byre.argname = "Input112", byre.argtype = 1 : i32}, %arg113: memref {byre.argname = "Input113", byre.argtype = 1 : i32}, %arg114: memref<512xf32, "cuda"> {byre.argname = "Input114", byre.argtype = 1 : i32}, %arg115: memref<512xf32, "cuda"> {byre.argname = "Input115", byre.argtype = 1 : i32}, %arg116: memref {byre.argname = "Input116", byre.argtype = 1 : i32}, %arg117: memref<512xf32, "cuda"> {byre.argname = "Input117", byre.argtype = 1 : i32}, %arg118: memref<512xf32, "cuda"> {byre.argname = "Input118", byre.argtype = 1 : i32}, %arg119: memref {byre.argname = "Input119", byre.argtype = 1 : i32}, %arg120: memref<512xf32, "cuda"> {byre.argname = "Input120", byre.argtype = 1 : i32}, %arg121: memref<512xf32, "cuda"> {byre.argname = "Input121", byre.argtype = 1 : i32}, %arg122: memref<1x3x224x224xf32, "cuda"> {byre.argname = "Input122", byre.argtype = 1 : i32}, %arg123: memref<1x1000xf16, "cuda"> {byre.argname = "Output0", byre.argtype = 2 : i32}, %arg124: memref<64xf32, "cuda"> {byre.arg_alias_index = 0 : i64, byre.argname = "Output1", byre.argtype = 2 : i32}, %arg125: memref<64xf32, "cuda"> {byre.arg_alias_index = 1 : i64, byre.argname = "Output2", byre.argtype = 2 : i32}, %arg126: memref<64xf32, "cuda"> {byre.arg_alias_index = 5 : i64, byre.argname = "Output3", byre.argtype = 2 : i32}, %arg127: memref<64xf32, "cuda"> {byre.arg_alias_index = 6 : i64, byre.argname = "Output4", byre.argtype = 2 : i32}, %arg128: memref<64xf32, "cuda"> {byre.arg_alias_index = 7 : i64, byre.argname = "Output5", byre.argtype = 2 : i32}, %arg129: memref<64xf32, "cuda"> {byre.arg_alias_index = 8 : i64, byre.argname = "Output6", byre.argtype = 2 : i32}, %arg130: memref<64xf32, "cuda"> {byre.arg_alias_index = 11 : i64, byre.argname = "Output7", byre.argtype = 2 : i32}, %arg131: memref<64xf32, "cuda"> {byre.arg_alias_index = 12 : i64, byre.argname = "Output8", byre.argtype = 2 : i32}, %arg132: memref<64xf32, "cuda"> {byre.arg_alias_index = 13 : i64, byre.argname = "Output9", byre.argtype = 2 : i32}, %arg133: memref<64xf32, "cuda"> {byre.arg_alias_index = 14 : i64, byre.argname = "Output10", byre.argtype = 2 : i32}, %arg134: memref<128xf32, "cuda"> {byre.arg_alias_index = 17 : i64, byre.argname = "Output11", 
byre.argtype = 2 : i32}, %arg135: memref<128xf32, "cuda"> {byre.arg_alias_index = 18 : i64, byre.argname = "Output12", byre.argtype = 2 : i32}, %arg136: memref<128xf32, "cuda"> {byre.arg_alias_index = 19 : i64, byre.argname = "Output13", byre.argtype = 2 : i32}, %arg137: memref<128xf32, "cuda"> {byre.arg_alias_index = 20 : i64, byre.argname = "Output14", byre.argtype = 2 : i32}, %arg138: memref<128xf32, "cuda"> {byre.arg_alias_index = 24 : i64, byre.argname = "Output15", byre.argtype = 2 : i32}, %arg139: memref<128xf32, "cuda"> {byre.arg_alias_index = 25 : i64, byre.argname = "Output16", byre.argtype = 2 : i32}, %arg140: memref<128xf32, "cuda"> {byre.arg_alias_index = 26 : i64, byre.argname = "Output17", byre.argtype = 2 : i32}, %arg141: memref<128xf32, "cuda"> {byre.arg_alias_index = 27 : i64, byre.argname = "Output18", byre.argtype = 2 : i32}, %arg142: memref<128xf32, "cuda"> {byre.arg_alias_index = 28 : i64, byre.argname = "Output19", byre.argtype = 2 : i32}, %arg143: memref<128xf32, "cuda"> {byre.arg_alias_index = 29 : i64, byre.argname = "Output20", byre.argtype = 2 : i32}, %arg144: memref<256xf32, "cuda"> {byre.arg_alias_index = 32 : i64, byre.argname = "Output21", byre.argtype = 2 : i32}, %arg145: memref<256xf32, "cuda"> {byre.arg_alias_index = 33 : i64, byre.argname = "Output22", byre.argtype = 2 : i32}, %arg146: memref<256xf32, "cuda"> {byre.arg_alias_index = 34 : i64, byre.argname = "Output23", byre.argtype = 2 : i32}, %arg147: memref<256xf32, "cuda"> {byre.arg_alias_index = 35 : i64, byre.argname = "Output24", byre.argtype = 2 : i32}, %arg148: memref<256xf32, "cuda"> {byre.arg_alias_index = 39 : i64, byre.argname = "Output25", byre.argtype = 2 : i32}, %arg149: memref<256xf32, "cuda"> {byre.arg_alias_index = 40 : i64, byre.argname = "Output26", byre.argtype = 2 : i32}, %arg150: memref<256xf32, "cuda"> {byre.arg_alias_index = 41 : i64, byre.argname = "Output27", byre.argtype = 2 : i32}, %arg151: memref<256xf32, "cuda"> {byre.arg_alias_index = 42 : i64, byre.argname = "Output28", byre.argtype = 2 : i32}, %arg152: memref<256xf32, "cuda"> {byre.arg_alias_index = 43 : i64, byre.argname = "Output29", byre.argtype = 2 : i32}, %arg153: memref<256xf32, "cuda"> {byre.arg_alias_index = 44 : i64, byre.argname = "Output30", byre.argtype = 2 : i32}, %arg154: memref<512xf32, "cuda"> {byre.arg_alias_index = 47 : i64, byre.argname = "Output31", byre.argtype = 2 : i32}, %arg155: memref<512xf32, "cuda"> {byre.arg_alias_index = 48 : i64, byre.argname = "Output32", byre.argtype = 2 : i32}, %arg156: memref<512xf32, "cuda"> {byre.arg_alias_index = 49 : i64, byre.argname = "Output33", byre.argtype = 2 : i32}, %arg157: memref<512xf32, "cuda"> {byre.arg_alias_index = 50 : i64, byre.argname = "Output34", byre.argtype = 2 : i32}, %arg158: memref<512xf32, "cuda"> {byre.arg_alias_index = 54 : i64, byre.argname = "Output35", byre.argtype = 2 : i32}, %arg159: memref<512xf32, "cuda"> {byre.arg_alias_index = 55 : i64, byre.argname = "Output36", byre.argtype = 2 : i32}, %arg160: memref<512xf32, "cuda"> {byre.arg_alias_index = 56 : i64, byre.argname = "Output37", byre.argtype = 2 : i32}, %arg161: memref<512xf32, "cuda"> {byre.arg_alias_index = 57 : i64, byre.argname = "Output38", byre.argtype = 2 : i32}, %arg162: memref<512xf32, "cuda"> {byre.arg_alias_index = 58 : i64, byre.argname = "Output39", byre.argtype = 2 : i32}, %arg163: memref<512xf32, "cuda"> {byre.arg_alias_index = 59 : i64, byre.argname = "Output40", byre.argtype = 2 : i32}, %arg164: memref<64xf32, "cuda"> {byre.argname = "Output41", byre.argtype = 2 : 
i32}, %arg165: memref<64xf32, "cuda"> {byre.argname = "Output42", byre.argtype = 2 : i32}, %arg166: memref<64xf32, "cuda"> {byre.argname = "Output43", byre.argtype = 2 : i32}, %arg167: memref<64xf32, "cuda"> {byre.argname = "Output44", byre.argtype = 2 : i32}, %arg168: memref<64xf32, "cuda"> {byre.argname = "Output45", byre.argtype = 2 : i32}, %arg169: memref<64xf32, "cuda"> {byre.argname = "Output46", byre.argtype = 2 : i32}, %arg170: memref<64xf32, "cuda"> {byre.argname = "Output47", byre.argtype = 2 : i32}, %arg171: memref<64xf32, "cuda"> {byre.argname = "Output48", byre.argtype = 2 : i32}, %arg172: memref<64xf32, "cuda"> {byre.argname = "Output49", byre.argtype = 2 : i32}, %arg173: memref<64xf32, "cuda"> {byre.argname = "Output50", byre.argtype = 2 : i32}, %arg174: memref<128xf32, "cuda"> {byre.argname = "Output51", byre.argtype = 2 : i32}, %arg175: memref<128xf32, "cuda"> {byre.argname = "Output52", byre.argtype = 2 : i32}, %arg176: memref<128xf32, "cuda"> {byre.argname = "Output53", byre.argtype = 2 : i32}, %arg177: memref<128xf32, "cuda"> {byre.argname = "Output54", byre.argtype = 2 : i32}, %arg178: memref<128xf32, "cuda"> {byre.argname = "Output55", byre.argtype = 2 : i32}, %arg179: memref<128xf32, "cuda"> {byre.argname = "Output56", byre.argtype = 2 : i32}, %arg180: memref<128xf32, "cuda"> {byre.argname = "Output57", byre.argtype = 2 : i32}, %arg181: memref<128xf32, "cuda"> {byre.argname = "Output58", byre.argtype = 2 : i32}, %arg182: memref<128xf32, "cuda"> {byre.argname = "Output59", byre.argtype = 2 : i32}, %arg183: memref<128xf32, "cuda"> {byre.argname = "Output60", byre.argtype = 2 : i32}, %arg184: memref<256xf32, "cuda"> {byre.argname = "Output61", byre.argtype = 2 : i32}, %arg185: memref<256xf32, "cuda"> {byre.argname = "Output62", byre.argtype = 2 : i32}, %arg186: memref<256xf32, "cuda"> {byre.argname = "Output63", byre.argtype = 2 : i32}, %arg187: memref<256xf32, "cuda"> {byre.argname = "Output64", byre.argtype = 2 : i32}, %arg188: memref<256xf32, "cuda"> {byre.argname = "Output65", byre.argtype = 2 : i32}, %arg189: memref<256xf32, "cuda"> {byre.argname = "Output66", byre.argtype = 2 : i32}, %arg190: memref<256xf32, "cuda"> {byre.argname = "Output67", byre.argtype = 2 : i32}, %arg191: memref<256xf32, "cuda"> {byre.argname = "Output68", byre.argtype = 2 : i32}, %arg192: memref<256xf32, "cuda"> {byre.argname = "Output69", byre.argtype = 2 : i32}, %arg193: memref<256xf32, "cuda"> {byre.argname = "Output70", byre.argtype = 2 : i32}, %arg194: memref<512xf32, "cuda"> {byre.argname = "Output71", byre.argtype = 2 : i32}, %arg195: memref<512xf32, "cuda"> {byre.argname = "Output72", byre.argtype = 2 : i32}, %arg196: memref<512xf32, "cuda"> {byre.argname = "Output73", byre.argtype = 2 : i32}, %arg197: memref<512xf32, "cuda"> {byre.argname = "Output74", byre.argtype = 2 : i32}, %arg198: memref<512xf32, "cuda"> {byre.argname = "Output75", byre.argtype = 2 : i32}, %arg199: memref<512xf32, "cuda"> {byre.argname = "Output76", byre.argtype = 2 : i32}, %arg200: memref<512xf32, "cuda"> {byre.argname = "Output77", byre.argtype = 2 : i32}, %arg201: memref<512xf32, "cuda"> {byre.argname = "Output78", byre.argtype = 2 : i32}, %arg202: memref<512xf32, "cuda"> {byre.argname = "Output79", byre.argtype = 2 : i32}, %arg203: memref<512xf32, "cuda"> {byre.argname = "Output80", byre.argtype = 2 : i32}, %arg204: memref<64x3x7x7xf16, "cuda"> {byre.argname = "Output81", byre.argtype = 2 : i32}, %arg205: memref<1x3x224x224xf16, "cuda"> {byre.argname = "Output82", byre.argtype = 2 : i32}, %arg206: 
memref<1x64x112x112xf16, "cuda"> {byre.argname = "Output83", byre.argtype = 2 : i32}, %arg207: memref<1x64x112x112xf16, "cuda"> {byre.argname = "Output84", byre.argtype = 2 : i32}, %arg208: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output85", byre.argtype = 2 : i32}, %arg209: memref<64x64x3x3xf16, "cuda"> {byre.argname = "Output86", byre.argtype = 2 : i32}, %arg210: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output87", byre.argtype = 2 : i32}, %arg211: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output88", byre.argtype = 2 : i32}, %arg212: memref<64x64x3x3xf16, "cuda"> {byre.argname = "Output89", byre.argtype = 2 : i32}, %arg213: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output90", byre.argtype = 2 : i32}, %arg214: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output91", byre.argtype = 2 : i32}, %arg215: memref<64x64x3x3xf16, "cuda"> {byre.argname = "Output92", byre.argtype = 2 : i32}, %arg216: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output93", byre.argtype = 2 : i32}, %arg217: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output94", byre.argtype = 2 : i32}, %arg218: memref<64x64x3x3xf16, "cuda"> {byre.argname = "Output95", byre.argtype = 2 : i32}, %arg219: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output96", byre.argtype = 2 : i32}, %arg220: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output97", byre.argtype = 2 : i32}, %arg221: memref<128x64x3x3xf16, "cuda"> {byre.argname = "Output98", byre.argtype = 2 : i32}, %arg222: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output99", byre.argtype = 2 : i32}, %arg223: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output100", byre.argtype = 2 : i32}, %arg224: memref<128x128x3x3xf16, "cuda"> {byre.argname = "Output101", byre.argtype = 2 : i32}, %arg225: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output102", byre.argtype = 2 : i32}, %arg226: memref<128x64x1x1xf16, "cuda"> {byre.argname = "Output103", byre.argtype = 2 : i32}, %arg227: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output104", byre.argtype = 2 : i32}, %arg228: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output105", byre.argtype = 2 : i32}, %arg229: memref<128x128x3x3xf16, "cuda"> {byre.argname = "Output106", byre.argtype = 2 : i32}, %arg230: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output107", byre.argtype = 2 : i32}, %arg231: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output108", byre.argtype = 2 : i32}, %arg232: memref<128x128x3x3xf16, "cuda"> {byre.argname = "Output109", byre.argtype = 2 : i32}, %arg233: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output110", byre.argtype = 2 : i32}, %arg234: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output111", byre.argtype = 2 : i32}, %arg235: memref<256x128x3x3xf16, "cuda"> {byre.argname = "Output112", byre.argtype = 2 : i32}, %arg236: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output113", byre.argtype = 2 : i32}, %arg237: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output114", byre.argtype = 2 : i32}, %arg238: memref<256x256x3x3xf16, "cuda"> {byre.argname = "Output115", byre.argtype = 2 : i32}, %arg239: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output116", byre.argtype = 2 : i32}, %arg240: memref<256x128x1x1xf16, "cuda"> {byre.argname = "Output117", byre.argtype = 2 : i32}, %arg241: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output118", byre.argtype = 2 : i32}, %arg242: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output119", byre.argtype = 2 : i32}, %arg243: memref<256x256x3x3xf16, "cuda"> {byre.argname = "Output120", byre.argtype 
= 2 : i32}, %arg244: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output121", byre.argtype = 2 : i32}, %arg245: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output122", byre.argtype = 2 : i32}, %arg246: memref<256x256x3x3xf16, "cuda"> {byre.argname = "Output123", byre.argtype = 2 : i32}, %arg247: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output124", byre.argtype = 2 : i32}, %arg248: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output125", byre.argtype = 2 : i32}, %arg249: memref<512x256x3x3xf16, "cuda"> {byre.argname = "Output126", byre.argtype = 2 : i32}, %arg250: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output127", byre.argtype = 2 : i32}, %arg251: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output128", byre.argtype = 2 : i32}, %arg252: memref<512x512x3x3xf16, "cuda"> {byre.argname = "Output129", byre.argtype = 2 : i32}, %arg253: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output130", byre.argtype = 2 : i32}, %arg254: memref<512x256x1x1xf16, "cuda"> {byre.argname = "Output131", byre.argtype = 2 : i32}, %arg255: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output132", byre.argtype = 2 : i32}, %arg256: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output133", byre.argtype = 2 : i32}, %arg257: memref<512x512x3x3xf16, "cuda"> {byre.argname = "Output134", byre.argtype = 2 : i32}, %arg258: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output135", byre.argtype = 2 : i32}, %arg259: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output136", byre.argtype = 2 : i32}, %arg260: memref<512x512x3x3xf16, "cuda"> {byre.argname = "Output137", byre.argtype = 2 : i32}, %arg261: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output138", byre.argtype = 2 : i32}, %arg262: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output139", byre.argtype = 2 : i32}, %arg263: memref<1x512xf16, "cuda"> {byre.argname = "Output140", byre.argtype = 2 : i32}, %arg264: memref<512x1000xf16, "cuda"> {byre.argname = "Output141", byre.argtype = 2 : i32}) attributes {byre.entry_point} { %alloc = memref.alloc() : memref<1838592xi8, "cuda"> - byre.compute @PTXOp(%arg122, %arg205) {BlockSize.x = 128 : i32, GridSize.x = 1176 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown0", memory_effects = [1 : i32, 2 : i32]} : memref<1x3x224x224xf32, "cuda">, memref<1x3x224x224xf16, "cuda"> - byre.compute @PTXOp(%arg2, %arg204) {BlockSize.x = 128 : i32, GridSize.x = 74 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown1", memory_effects = [1 : i32, 2 : i32]} : memref<64x3x7x7xf32, "cuda">, memref<64x3x7x7xf16, "cuda"> + byre.compute @PTXOp(%arg122, %arg205) {BlockSize.x = 256 : i32, GridSize.x = 147 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown0", memory_effects = [1 : i32, 2 : i32]} : memref<1x3x224x224xf32, "cuda">, memref<1x3x224x224xf16, "cuda"> + byre.compute @PTXOp(%arg2, %arg204) {BlockSize.x = 256 : i32, GridSize.x = 10 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown1", memory_effects = [1 : i32, 2 : i32]} : memref<64x3x7x7xf32, "cuda">, memref<64x3x7x7xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg205, %arg204, %arg206) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x3x224x224xf16, "cuda">, memref<64x3x7x7xf16, "cuda">, 
memref<1x64x112x112xf16, "cuda"> - %0 = "byre.alias"(%alloc) {offset = 224768 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x64x112x112xf16, "cuda"> - %1 = "byre.alias"(%alloc) {offset = 7424 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> - %2 = "byre.alias"(%alloc) {offset = 7168 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %0 = "byre.alias"(%alloc) <{offset = 224768 : i64}> : (memref<1838592xi8, "cuda">) -> memref<1x64x112x112xf16, "cuda"> + %1 = "byre.alias"(%alloc) <{offset = 7424 : i64}> : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %2 = "byre.alias"(%alloc) <{offset = 7168 : i64}> : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg206, %arg1, %arg0, %0, %1, %2) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<1x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%0, %arg207) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown3", memory_effects = [1 : i32, 2 : i32]} : memref<1x64x112x112xf16, "cuda">, memref<1x64x112x112xf16, "cuda"> + byre.compute @PTXOp(%0, %arg207) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown3", memory_effects = [1 : i32, 2 : i32]} : memref<1x64x112x112xf16, "cuda">, memref<1x64x112x112xf16, "cuda"> byre.compute @PoolMaxOp_f16_f16(%arg207, %arg208) {base_dilations = dense<1> : tensor<4xi64>, device = "cuda", memory_effects = [1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = dense<1> : tensor<4xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<1x64x112x112xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - byre.compute @PTXOp(%arg9, %arg209) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown4", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg9, %arg209) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown4", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg208, %arg209, %arg210) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %3 = "byre.alias"(%alloc) {offset = 224768 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> - %4 = "byre.alias"(%alloc) {offset = 6912 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> - %5 = "byre.alias"(%alloc) {offset = 6656 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %3 = "byre.alias"(%alloc) <{offset = 224768 : i64}> : (memref<1838592xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> + %4 = "byre.alias"(%alloc) <{offset = 6912 : i64}> : 
(memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %5 = "byre.alias"(%alloc) <{offset = 6656 : i64}> : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg210, %arg6, %arg5, %3, %4, %5) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%3, %arg211) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown6", memory_effects = [1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - byre.compute @PTXOp(%arg10, %arg212) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown7", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%3, %arg211) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown6", memory_effects = [1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + byre.compute @PTXOp(%arg10, %arg212) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown4", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg211, %arg212, %arg213) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %6 = "byre.alias"(%alloc) {offset = 6400 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> - %7 = "byre.alias"(%alloc) {offset = 6144 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %6 = "byre.alias"(%alloc) <{offset = 6400 : i64}> : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %7 = "byre.alias"(%alloc) <{offset = 6144 : i64}> : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg213, %arg8, %arg7, %3, %6, %7) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%3, %arg208, %arg214) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown9", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - byre.compute @PTXOp(%arg15, %arg215) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown10", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%3, %arg208, %arg214) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : 
i32, 4 : i32, 4 : i32], kernel_name = "Unknown9", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + byre.compute @PTXOp(%arg15, %arg215) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown4", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg214, %arg215, %arg216) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %8 = "byre.alias"(%alloc) {offset = 5888 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> - %9 = "byre.alias"(%alloc) {offset = 5632 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %8 = "byre.alias"(%alloc) <{offset = 5888 : i64}> : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %9 = "byre.alias"(%alloc) <{offset = 5632 : i64}> : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg216, %arg12, %arg11, %3, %8, %9) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%3, %arg217) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown12", memory_effects = [1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - byre.compute @PTXOp(%arg16, %arg218) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown13", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%3, %arg217) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown6", memory_effects = [1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + byre.compute @PTXOp(%arg16, %arg218) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown4", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg217, %arg218, %arg219) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %10 = "byre.alias"(%alloc) {offset = 5376 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> - %11 = "byre.alias"(%alloc) {offset = 0 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %10 = "byre.alias"(%alloc) <{offset = 5376 : i64}> : 
(memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %11 = "byre.alias"(%alloc) <{offset = 0 : i64}> : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg219, %arg14, %arg13, %3, %10, %11) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%3, %arg214, %arg220) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown15", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - byre.compute @PTXOp(%arg23, %arg226) {BlockSize.x = 128 : i32, GridSize.x = 64 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown16", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x1x1xf32, "cuda">, memref<128x64x1x1xf16, "cuda"> + byre.compute @PTXOp(%3, %arg214, %arg220) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown9", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + byre.compute @PTXOp(%arg23, %arg226) {BlockSize.x = 256 : i32, GridSize.x = 8 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown16", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x1x1xf32, "cuda">, memref<128x64x1x1xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg220, %arg226, %arg227) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<128x64x1x1xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %12 = "byre.alias"(%alloc) {offset = 8704 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> - %13 = "byre.alias"(%alloc) {offset = 256 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> - %14 = "byre.alias"(%alloc) {offset = 768 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %12 = "byre.alias"(%alloc) <{offset = 8704 : i64}> : (memref<1838592xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + %13 = "byre.alias"(%alloc) <{offset = 256 : i64}> : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %14 = "byre.alias"(%alloc) <{offset = 768 : i64}> : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg227, %arg25, %arg24, %12, %13, %14) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%arg21, %arg221) {BlockSize.x = 128 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown18", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x3x3xf32, "cuda">, memref<128x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg21, %arg221) 
{BlockSize.x = 256 : i32, GridSize.x = 72 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown18", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x3x3xf32, "cuda">, memref<128x64x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg220, %arg221, %arg222) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<128x64x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %15 = "byre.alias"(%alloc) {offset = 224768 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> - %16 = "byre.alias"(%alloc) {offset = 4864 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> - %17 = "byre.alias"(%alloc) {offset = 1280 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %15 = "byre.alias"(%alloc) <{offset = 224768 : i64}> : (memref<1838592xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + %16 = "byre.alias"(%alloc) <{offset = 4864 : i64}> : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %17 = "byre.alias"(%alloc) <{offset = 1280 : i64}> : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg222, %arg18, %arg17, %15, %16, %17) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%15, %arg223) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown20", memory_effects = [1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - byre.compute @PTXOp(%arg22, %arg224) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown21", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%15, %arg223) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown20", memory_effects = [1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%arg22, %arg224) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown21", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg223, %arg224, %arg225) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %18 = "byre.alias"(%alloc) {offset = 1792 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> - %19 = "byre.alias"(%alloc) {offset = 2304 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + 
%18 = "byre.alias"(%alloc) <{offset = 1792 : i64}> : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %19 = "byre.alias"(%alloc) <{offset = 2304 : i64}> : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg225, %arg20, %arg19, %15, %18, %19) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%15, %12, %arg228) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown23", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - byre.compute @PTXOp(%arg30, %arg229) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown24", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%15, %12, %arg228) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown23", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%arg30, %arg229) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown21", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg228, %arg229, %arg230) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %20 = "byre.alias"(%alloc) {offset = 2816 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> - %21 = "byre.alias"(%alloc) {offset = 3328 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %20 = "byre.alias"(%alloc) <{offset = 2816 : i64}> : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %21 = "byre.alias"(%alloc) <{offset = 3328 : i64}> : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg230, %arg27, %arg26, %15, %20, %21) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%15, %arg231) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown26", memory_effects = [1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - byre.compute @PTXOp(%arg31, %arg232) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown27", memory_effects = [1 : i32, 2 : i32]} : 
memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%15, %arg231) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown20", memory_effects = [1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%arg31, %arg232) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown21", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg231, %arg232, %arg233) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %22 = "byre.alias"(%alloc) {offset = 3840 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> - %23 = "byre.alias"(%alloc) {offset = 4352 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %22 = "byre.alias"(%alloc) <{offset = 3840 : i64}> : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %23 = "byre.alias"(%alloc) <{offset = 4352 : i64}> : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg233, %arg29, %arg28, %15, %22, %23) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%15, %arg228, %arg234) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown29", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - byre.compute @PTXOp(%arg38, %arg240) {BlockSize.x = 128 : i32, GridSize.x = 256 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown30", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x1x1xf32, "cuda">, memref<256x128x1x1xf16, "cuda"> + byre.compute @PTXOp(%15, %arg228, %arg234) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown23", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%arg38, %arg240) {BlockSize.x = 256 : i32, GridSize.x = 32 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown30", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x1x1xf32, "cuda">, memref<256x128x1x1xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg234, %arg240, %arg241) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<256x128x1x1xf16, "cuda">, 
memref<1x256x14x14xf16, "cuda"> - %24 = "byre.alias"(%alloc) {offset = 224768 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - %25 = "byre.alias"(%alloc) {offset = 223744 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> - %26 = "byre.alias"(%alloc) {offset = 1836544 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %24 = "byre.alias"(%alloc) <{offset = 224768 : i64}> : (memref<1838592xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + %25 = "byre.alias"(%alloc) <{offset = 223744 : i64}> : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %26 = "byre.alias"(%alloc) <{offset = 1836544 : i64}> : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg241, %arg40, %arg39, %24, %25, %26) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%arg36, %arg235) {BlockSize.x = 128 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown32", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x3x3xf32, "cuda">, memref<256x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg36, %arg235) {BlockSize.x = 256 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown32", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x3x3xf32, "cuda">, memref<256x128x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg234, %arg235, %arg236) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %27 = "byre.alias"(%alloc) {offset = 325120 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - %28 = "byre.alias"(%alloc) {offset = 1835520 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> - %29 = "byre.alias"(%alloc) {offset = 1834496 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %27 = "byre.alias"(%alloc) <{offset = 325120 : i64}> : (memref<1838592xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + %28 = "byre.alias"(%alloc) <{offset = 1835520 : i64}> : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %29 = "byre.alias"(%alloc) <{offset = 1834496 : i64}> : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg236, %arg33, %arg32, %27, %28, %29) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%27, %arg237) {BlockSize.x = 128 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown34", memory_effects = [1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - byre.compute @PTXOp(%arg37, %arg238) {BlockSize.x = 
128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown35", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%27, %arg237) {BlockSize.x = 256 : i32, GridSize.x = 49 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown34", memory_effects = [1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + byre.compute @PTXOp(%arg37, %arg238) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown35", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg237, %arg238, %arg239) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %30 = "byre.alias"(%alloc) {offset = 1833472 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> - %31 = "byre.alias"(%alloc) {offset = 1837568 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %30 = "byre.alias"(%alloc) <{offset = 1833472 : i64}> : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %31 = "byre.alias"(%alloc) <{offset = 1837568 : i64}> : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg239, %arg35, %arg34, %27, %30, %31) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%27, %24, %arg242) {BlockSize.x = 128 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown37", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - byre.compute @PTXOp(%arg45, %arg243) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown38", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%27, %24, %arg242) {BlockSize.x = 256 : i32, GridSize.x = 49 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown37", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + byre.compute @PTXOp(%arg45, %arg243) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown35", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg242, %arg243, %arg244) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : 
tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %32 = "byre.alias"(%alloc) {offset = 1832448 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> - %33 = "byre.alias"(%alloc) {offset = 1831424 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %32 = "byre.alias"(%alloc) <{offset = 1832448 : i64}> : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %33 = "byre.alias"(%alloc) <{offset = 1831424 : i64}> : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg244, %arg42, %arg41, %24, %32, %33) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%24, %arg245) {BlockSize.x = 128 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown40", memory_effects = [1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - byre.compute @PTXOp(%arg46, %arg246) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown41", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%24, %arg245) {BlockSize.x = 256 : i32, GridSize.x = 49 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown34", memory_effects = [1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + byre.compute @PTXOp(%arg46, %arg246) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown35", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg245, %arg246, %arg247) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %34 = "byre.alias"(%alloc) {offset = 1830400 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> - %35 = "byre.alias"(%alloc) {offset = 7680 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %34 = "byre.alias"(%alloc) <{offset = 1830400 : i64}> : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %35 = "byre.alias"(%alloc) <{offset = 7680 : i64}> : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg247, %arg44, %arg43, %24, %34, %35) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%24, %arg242, %arg248) {BlockSize.x = 128 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown43", 
memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - byre.compute @PTXOp(%arg53, %arg254) {BlockSize.x = 128 : i32, GridSize.x = 1024 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown44", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x1x1xf32, "cuda">, memref<512x256x1x1xf16, "cuda"> + byre.compute @PTXOp(%24, %arg242, %arg248) {BlockSize.x = 256 : i32, GridSize.x = 49 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown37", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + byre.compute @PTXOp(%arg53, %arg254) {BlockSize.x = 256 : i32, GridSize.x = 128 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown44", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x1x1xf32, "cuda">, memref<512x256x1x1xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg248, %arg254, %arg255) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %36 = "byre.alias"(%alloc) {offset = 224768 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> - %37 = "byre.alias"(%alloc) {offset = 8704 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> - %38 = "byre.alias"(%alloc) {offset = 209408 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %36 = "byre.alias"(%alloc) <{offset = 224768 : i64}> : (memref<1838592xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + %37 = "byre.alias"(%alloc) <{offset = 8704 : i64}> : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %38 = "byre.alias"(%alloc) <{offset = 209408 : i64}> : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg255, %arg55, %arg54, %36, %37, %38) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%arg51, %arg249) {BlockSize.x = 128 : i32, GridSize.x = 9216 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown46", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x3x3xf32, "cuda">, memref<512x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg51, %arg249) {BlockSize.x = 256 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown46", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x3x3xf32, "cuda">, memref<512x256x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg248, %arg249, %arg250) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<512x256x3x3xf16, "cuda">, 
memref<1x512x7x7xf16, "cuda"> - %39 = "byre.alias"(%alloc) {offset = 274944 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> - %40 = "byre.alias"(%alloc) {offset = 12800 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> - %41 = "byre.alias"(%alloc) {offset = 10752 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %39 = "byre.alias"(%alloc) <{offset = 274944 : i64}> : (memref<1838592xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + %40 = "byre.alias"(%alloc) <{offset = 12800 : i64}> : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %41 = "byre.alias"(%alloc) <{offset = 10752 : i64}> : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg250, %arg48, %arg47, %39, %40, %41) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%39, %arg251) {BlockSize.x = 128 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown48", memory_effects = [1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - byre.compute @PTXOp(%arg52, %arg252) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown49", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%39, %arg251) {BlockSize.x = 256 : i32, GridSize.x = 25 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown48", memory_effects = [1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> + byre.compute @PTXOp(%arg52, %arg252) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown49", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg251, %arg252, %arg253) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %42 = "byre.alias"(%alloc) {offset = 211456 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> - %43 = "byre.alias"(%alloc) {offset = 213504 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %42 = "byre.alias"(%alloc) <{offset = 211456 : i64}> : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %43 = "byre.alias"(%alloc) <{offset = 213504 : i64}> : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg253, %arg50, %arg49, %39, %42, %43) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%39, %36, %arg256) {BlockSize.x = 128 : i32, GridSize.x = 
196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown51", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - byre.compute @PTXOp(%arg60, %arg257) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown52", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%39, %36, %arg256) {BlockSize.x = 256 : i32, GridSize.x = 25 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown51", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> + byre.compute @PTXOp(%arg60, %arg257) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown49", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg256, %arg257, %arg258) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %44 = "byre.alias"(%alloc) {offset = 215552 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> - %45 = "byre.alias"(%alloc) {offset = 217600 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %44 = "byre.alias"(%alloc) <{offset = 215552 : i64}> : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %45 = "byre.alias"(%alloc) <{offset = 217600 : i64}> : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg258, %arg57, %arg56, %36, %44, %45) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%36, %arg259) {BlockSize.x = 128 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown54", memory_effects = [1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - byre.compute @PTXOp(%arg61, %arg260) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown55", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%36, %arg259) {BlockSize.x = 256 : i32, GridSize.x = 25 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown48", memory_effects = [1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> + byre.compute @PTXOp(%arg61, %arg260) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown49", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg259, %arg260, %arg261) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", 
kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %46 = "byre.alias"(%alloc) {offset = 219648 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> - %47 = "byre.alias"(%alloc) {offset = 221696 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %46 = "byre.alias"(%alloc) <{offset = 219648 : i64}> : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %47 = "byre.alias"(%alloc) <{offset = 221696 : i64}> : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg261, %arg59, %arg58, %36, %46, %47) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%36, %arg256, %arg262) {BlockSize.x = 128 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown57", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %48 = "byre.alias"(%alloc) {offset = 224768 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x512xf16, "cuda"> - byre.compute @ReduceSumOp_f16_f16(%arg262, %48) {device = "cuda", dimensions = dense<[3, 2]> : tensor<2xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512xf16, "cuda"> - byre.compute @PTXOp(%48, %arg263) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown58", memory_effects = [1 : i32, 2 : i32]} : memref<1x512xf16, "cuda">, memref<1x512xf16, "cuda"> - %49 = "byre.alias"(%alloc) {offset = 224768 : i64} : (memref<1838592xi8, "cuda">) -> memref<1000x512xf16, "cuda"> - byre.compute @PTXOp(%arg4, %49) {BlockSize.x = 128 : i32, GridSize.x = 4000 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown59", memory_effects = [1 : i32, 2 : i32]} : memref<1000x512xf32, "cuda">, memref<1000x512xf16, "cuda"> - byre.compute @TransposeOp_f16_f16(%49, %arg264) {device = "cuda", memory_effects = [1 : i32, 2 : i32], minor_to_major = dense<[0, 1]> : tensor<2xindex>, permutation = dense<[1, 0]> : tensor<2xi64>} : memref<1000x512xf16, "cuda">, memref<512x1000xf16, "cuda"> - %50 = "byre.alias"(%alloc) {offset = 14848 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x1000xf16, "cuda"> - byre.compute @MatmulOp_f16f16_f16(%arg263, %49, %50) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<1x512xf16, "cuda">, memref<1000x512xf16, "cuda">, memref<1x1000xf16, "cuda"> - byre.compute @PTXOp(%arg3, %50, %arg123) {BlockSize.x = 128 : i32, GridSize.x = 8 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown60", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1000xf32, "cuda">, memref<1x1000xf16, "cuda">, memref<1x1000xf16, "cuda"> - byre.compute @PTXOp(%1, %arg63, %arg164) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown61", memory_effects = [1 : i32, 1 : 
i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%2, %arg64, %arg165) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%4, %arg66, %arg166) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown63", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%5, %arg67, %arg167) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown64", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%6, %arg69, %arg168) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown65", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%7, %arg70, %arg169) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown66", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%8, %arg72, %arg170) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown67", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%9, %arg73, %arg171) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown68", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%10, %arg75, %arg172) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown69", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%11, %arg76, %arg173) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown70", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%16, %arg78, %arg174) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown71", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%17, %arg79, %arg175) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%18, %arg81, %arg176) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown73", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%19, %arg82, %arg177) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 
: i32, 1 : i32], kernel_name = "Unknown74", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%13, %arg84, %arg178) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown75", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%14, %arg85, %arg179) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown76", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%20, %arg87, %arg180) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown77", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%21, %arg88, %arg181) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown78", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%22, %arg90, %arg182) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown79", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%23, %arg91, %arg183) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown80", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%28, %arg93, %arg184) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown81", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%29, %arg94, %arg185) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%30, %arg96, %arg186) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown83", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%31, %arg97, %arg187) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown84", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%25, %arg99, %arg188) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown85", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%26, %arg100, %arg189) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown86", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - 
byre.compute @PTXOp(%32, %arg102, %arg190) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown87", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%33, %arg103, %arg191) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown88", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%34, %arg105, %arg192) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown89", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%35, %arg106, %arg193) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown90", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%40, %arg108, %arg194) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown91", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%41, %arg109, %arg195) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%42, %arg111, %arg196) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown93", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%43, %arg112, %arg197) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown94", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%37, %arg114, %arg198) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown95", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%38, %arg115, %arg199) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown96", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%44, %arg117, %arg200) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown97", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%45, %arg118, %arg201) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown98", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%46, %arg120, %arg202) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown99", 
memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%47, %arg121, %arg203) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown100", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%36, %arg256, %arg262) {BlockSize.x = 256 : i32, GridSize.x = 25 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown51", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> + %48 = "byre.alias"(%arg262) <{offset = 0 : i64}> : (memref<1x512x7x7xf16, "cuda">) -> memref<512x49xf16, "cuda"> + %49 = "byre.alias"(%alloc) <{offset = 224768 : i64}> : (memref<1838592xi8, "cuda">) -> memref<512xf16, "cuda"> + byre.compute @PTXOp(%48, %49) {BlockSize.x = 64 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 512 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown58_kernel"} : memref<512x49xf16, "cuda">, memref<512xf16, "cuda"> + %50 = "byre.alias"(%alloc) <{offset = 224768 : i64}> : (memref<1838592xi8, "cuda">) -> memref<1x512xf16, "cuda"> + byre.compute @PTXOp(%50, %arg263) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown59", memory_effects = [1 : i32, 2 : i32]} : memref<1x512xf16, "cuda">, memref<1x512xf16, "cuda"> + %51 = "byre.alias"(%alloc) <{offset = 224768 : i64}> : (memref<1838592xi8, "cuda">) -> memref<1000x512xf16, "cuda"> + byre.compute @PTXOp(%arg4, %51) {BlockSize.x = 256 : i32, GridSize.x = 500 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown60", memory_effects = [1 : i32, 2 : i32]} : memref<1000x512xf32, "cuda">, memref<1000x512xf16, "cuda"> + byre.compute @TransposeOp_f16_f16(%51, %arg264) {device = "cuda", memory_effects = [1 : i32, 2 : i32], minor_to_major = dense<[0, 1]> : tensor<2xindex>, permutation = dense<[1, 0]> : tensor<2xi64>} : memref<1000x512xf16, "cuda">, memref<512x1000xf16, "cuda"> + %52 = "byre.alias"(%alloc) <{offset = 14848 : i64}> : (memref<1838592xi8, "cuda">) -> memref<1x1000xf16, "cuda"> + byre.compute @MatmulOp_f16f16_f16(%arg263, %51, %52) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<1x512xf16, "cuda">, memref<1000x512xf16, "cuda">, memref<1x1000xf16, "cuda"> + byre.compute @PTXOp(%arg3, %52, %arg123) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown61", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1000xf32, "cuda">, memref<1x1000xf16, "cuda">, memref<1x1000xf16, "cuda"> + byre.compute @PTXOp(%1, %arg63, %arg164) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%2, %arg64, %arg165) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%4, %arg66, %arg166) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], 
kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%5, %arg67, %arg167) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%6, %arg69, %arg168) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%7, %arg70, %arg169) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%8, %arg72, %arg170) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%9, %arg73, %arg171) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%10, %arg75, %arg172) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%11, %arg76, %arg173) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%16, %arg78, %arg174) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%17, %arg79, %arg175) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%18, %arg81, %arg176) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%19, %arg82, %arg177) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%13, %arg84, %arg178) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%14, %arg85, %arg179) 
{BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%20, %arg87, %arg180) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%21, %arg88, %arg181) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%22, %arg90, %arg182) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%23, %arg91, %arg183) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%28, %arg93, %arg184) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%29, %arg94, %arg185) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%30, %arg96, %arg186) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%31, %arg97, %arg187) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%25, %arg99, %arg188) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%26, %arg100, %arg189) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%32, %arg102, %arg190) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%33, %arg103, %arg191) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : 
memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%34, %arg105, %arg192) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%35, %arg106, %arg193) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%40, %arg108, %arg194) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%41, %arg109, %arg195) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%42, %arg111, %arg196) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%43, %arg112, %arg197) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%37, %arg114, %arg198) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%38, %arg115, %arg199) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%44, %arg117, %arg200) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%45, %arg118, %arg201) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%46, %arg120, %arg202) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%47, %arg121, %arg203) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> byre.copy(%arg0, %arg124) {callee = "cuda2cuda"} : memref<64xf32, "cuda">, 
memref<64xf32, "cuda"> byre.copy(%arg1, %arg125) {callee = "cuda2cuda"} : memref<64xf32, "cuda">, memref<64xf32, "cuda"> byre.copy(%arg5, %arg126) {callee = "cuda2cuda"} : memref<64xf32, "cuda">, memref<64xf32, "cuda"> diff --git a/compiler/test/E2E/ResNet18/FW/device_output.ptx b/compiler/test/E2E/ResNet18/FW/device_output.ptx index 8b9cc4b4e..ec3082837 100644 --- a/compiler/test/E2E/ResNet18/FW/device_output.ptx +++ b/compiler/test/E2E/ResNet18/FW/device_output.ptx @@ -6,424 +6,14 @@ .target sm_70 .address_size 64 - // .globl Unknown100 - -.visible .entry Unknown100( - .param .u64 Unknown100_param_0, - .param .u64 Unknown100_param_1, - .param .u64 Unknown100_param_2, - .param .u64 Unknown100_param_3, - .param .u64 Unknown100_param_4, - .param .u64 Unknown100_param_5, - .param .u64 Unknown100_param_6, - .param .u64 Unknown100_param_7, - .param .u64 Unknown100_param_8, - .param .u64 Unknown100_param_9, - .param .u64 Unknown100_param_10, - .param .u64 Unknown100_param_11, - .param .u64 Unknown100_param_12, - .param .u64 Unknown100_param_13, - .param .u64 Unknown100_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 511; - @%p1 bra $L__BB0_2; - ld.param.u64 %rd5, [Unknown100_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown100_param_1]; - ld.param.u64 %rd7, [Unknown100_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB0_2: - ret; - -} - // .globl Unknown99 -.visible .entry Unknown99( - .param .u64 Unknown99_param_0, - .param .u64 Unknown99_param_1, - .param .u64 Unknown99_param_2, - .param .u64 Unknown99_param_3, - .param .u64 Unknown99_param_4, - .param .u64 Unknown99_param_5, - .param .u64 Unknown99_param_6, - .param .u64 Unknown99_param_7, - .param .u64 Unknown99_param_8, - .param .u64 Unknown99_param_9, - .param .u64 Unknown99_param_10, - .param .u64 Unknown99_param_11, - .param .u64 Unknown99_param_12, - .param .u64 Unknown99_param_13, - .param .u64 Unknown99_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 511; - @%p1 bra $L__BB1_2; - ld.param.u64 %rd5, [Unknown99_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown99_param_1]; - ld.param.u64 %rd7, [Unknown99_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB1_2: - ret; - -} - // .globl Unknown98 -.visible .entry Unknown98( - .param .u64 Unknown98_param_0, - .param .u64 Unknown98_param_1, - .param .u64 Unknown98_param_2, - .param .u64 Unknown98_param_3, - .param .u64 Unknown98_param_4, 
- .param .u64 Unknown98_param_5, - .param .u64 Unknown98_param_6, - .param .u64 Unknown98_param_7, - .param .u64 Unknown98_param_8, - .param .u64 Unknown98_param_9, - .param .u64 Unknown98_param_10, - .param .u64 Unknown98_param_11, - .param .u64 Unknown98_param_12, - .param .u64 Unknown98_param_13, - .param .u64 Unknown98_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 511; - @%p1 bra $L__BB2_2; - ld.param.u64 %rd5, [Unknown98_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown98_param_1]; - ld.param.u64 %rd7, [Unknown98_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB2_2: - ret; - -} - // .globl Unknown97 -.visible .entry Unknown97( - .param .u64 Unknown97_param_0, - .param .u64 Unknown97_param_1, - .param .u64 Unknown97_param_2, - .param .u64 Unknown97_param_3, - .param .u64 Unknown97_param_4, - .param .u64 Unknown97_param_5, - .param .u64 Unknown97_param_6, - .param .u64 Unknown97_param_7, - .param .u64 Unknown97_param_8, - .param .u64 Unknown97_param_9, - .param .u64 Unknown97_param_10, - .param .u64 Unknown97_param_11, - .param .u64 Unknown97_param_12, - .param .u64 Unknown97_param_13, - .param .u64 Unknown97_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 511; - @%p1 bra $L__BB3_2; - ld.param.u64 %rd5, [Unknown97_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown97_param_1]; - ld.param.u64 %rd7, [Unknown97_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB3_2: - ret; - -} - // .globl Unknown96 -.visible .entry Unknown96( - .param .u64 Unknown96_param_0, - .param .u64 Unknown96_param_1, - .param .u64 Unknown96_param_2, - .param .u64 Unknown96_param_3, - .param .u64 Unknown96_param_4, - .param .u64 Unknown96_param_5, - .param .u64 Unknown96_param_6, - .param .u64 Unknown96_param_7, - .param .u64 Unknown96_param_8, - .param .u64 Unknown96_param_9, - .param .u64 Unknown96_param_10, - .param .u64 Unknown96_param_11, - .param .u64 Unknown96_param_12, - .param .u64 Unknown96_param_13, - .param .u64 Unknown96_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 511; - @%p1 bra $L__BB4_2; - ld.param.u64 %rd5, [Unknown96_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown96_param_1]; - 
ld.param.u64 %rd7, [Unknown96_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB4_2: - ret; - -} - // .globl Unknown95 -.visible .entry Unknown95( - .param .u64 Unknown95_param_0, - .param .u64 Unknown95_param_1, - .param .u64 Unknown95_param_2, - .param .u64 Unknown95_param_3, - .param .u64 Unknown95_param_4, - .param .u64 Unknown95_param_5, - .param .u64 Unknown95_param_6, - .param .u64 Unknown95_param_7, - .param .u64 Unknown95_param_8, - .param .u64 Unknown95_param_9, - .param .u64 Unknown95_param_10, - .param .u64 Unknown95_param_11, - .param .u64 Unknown95_param_12, - .param .u64 Unknown95_param_13, - .param .u64 Unknown95_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 511; - @%p1 bra $L__BB5_2; - ld.param.u64 %rd5, [Unknown95_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown95_param_1]; - ld.param.u64 %rd7, [Unknown95_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB5_2: - ret; - -} - // .globl Unknown94 -.visible .entry Unknown94( - .param .u64 Unknown94_param_0, - .param .u64 Unknown94_param_1, - .param .u64 Unknown94_param_2, - .param .u64 Unknown94_param_3, - .param .u64 Unknown94_param_4, - .param .u64 Unknown94_param_5, - .param .u64 Unknown94_param_6, - .param .u64 Unknown94_param_7, - .param .u64 Unknown94_param_8, - .param .u64 Unknown94_param_9, - .param .u64 Unknown94_param_10, - .param .u64 Unknown94_param_11, - .param .u64 Unknown94_param_12, - .param .u64 Unknown94_param_13, - .param .u64 Unknown94_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 511; - @%p1 bra $L__BB6_2; - ld.param.u64 %rd5, [Unknown94_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown94_param_1]; - ld.param.u64 %rd7, [Unknown94_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB6_2: - ret; - -} - // .globl Unknown93 -.visible .entry Unknown93( - .param .u64 Unknown93_param_0, - .param .u64 Unknown93_param_1, - .param .u64 Unknown93_param_2, - .param .u64 Unknown93_param_3, - .param .u64 Unknown93_param_4, - .param .u64 Unknown93_param_5, - .param .u64 Unknown93_param_6, - .param .u64 Unknown93_param_7, - .param 
.u64 Unknown93_param_8, - .param .u64 Unknown93_param_9, - .param .u64 Unknown93_param_10, - .param .u64 Unknown93_param_11, - .param .u64 Unknown93_param_12, - .param .u64 Unknown93_param_13, - .param .u64 Unknown93_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 511; - @%p1 bra $L__BB7_2; - ld.param.u64 %rd5, [Unknown93_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown93_param_1]; - ld.param.u64 %rd7, [Unknown93_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB7_2: - ret; - -} // .globl Unknown92 +// __wg_Unknown58_kernel_0 has been demoted +// __wg_Unknown58_kernel_1 has been demoted +// __wg_Unknown58_kernel_2 has been demoted +// __wg_Unknown58_kernel_3 has been demoted +// __wg_Unknown58_kernel_4 has been demoted +// __wg_Unknown58_kernel_5 has been demoted + .visible .entry Unknown92( .param .u64 Unknown92_param_0, .param .u64 Unknown92_param_1, @@ -442,504 +32,44 @@ $L__BB7_2: .param .u64 Unknown92_param_14 ) { - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 511; - @%p1 bra $L__BB8_2; - ld.param.u64 %rd5, [Unknown92_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown92_param_1]; - ld.param.u64 %rd7, [Unknown92_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB8_2: - ret; - -} - // .globl Unknown91 -.visible .entry Unknown91( - .param .u64 Unknown91_param_0, - .param .u64 Unknown91_param_1, - .param .u64 Unknown91_param_2, - .param .u64 Unknown91_param_3, - .param .u64 Unknown91_param_4, - .param .u64 Unknown91_param_5, - .param .u64 Unknown91_param_6, - .param .u64 Unknown91_param_7, - .param .u64 Unknown91_param_8, - .param .u64 Unknown91_param_9, - .param .u64 Unknown91_param_10, - .param .u64 Unknown91_param_11, - .param .u64 Unknown91_param_12, - .param .u64 Unknown91_param_13, - .param .u64 Unknown91_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 511; - @%p1 bra $L__BB9_2; - ld.param.u64 %rd5, [Unknown91_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown91_param_1]; - ld.param.u64 %rd7, [Unknown91_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - 
ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB9_2: - ret; - -} - // .globl Unknown90 -.visible .entry Unknown90( - .param .u64 Unknown90_param_0, - .param .u64 Unknown90_param_1, - .param .u64 Unknown90_param_2, - .param .u64 Unknown90_param_3, - .param .u64 Unknown90_param_4, - .param .u64 Unknown90_param_5, - .param .u64 Unknown90_param_6, - .param .u64 Unknown90_param_7, - .param .u64 Unknown90_param_8, - .param .u64 Unknown90_param_9, - .param .u64 Unknown90_param_10, - .param .u64 Unknown90_param_11, - .param .u64 Unknown90_param_12, - .param .u64 Unknown90_param_13, - .param .u64 Unknown90_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 255; - @%p1 bra $L__BB10_2; - ld.param.u64 %rd5, [Unknown90_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown90_param_1]; - ld.param.u64 %rd7, [Unknown90_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB10_2: - ret; - -} - // .globl Unknown89 -.visible .entry Unknown89( - .param .u64 Unknown89_param_0, - .param .u64 Unknown89_param_1, - .param .u64 Unknown89_param_2, - .param .u64 Unknown89_param_3, - .param .u64 Unknown89_param_4, - .param .u64 Unknown89_param_5, - .param .u64 Unknown89_param_6, - .param .u64 Unknown89_param_7, - .param .u64 Unknown89_param_8, - .param .u64 Unknown89_param_9, - .param .u64 Unknown89_param_10, - .param .u64 Unknown89_param_11, - .param .u64 Unknown89_param_12, - .param .u64 Unknown89_param_13, - .param .u64 Unknown89_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 255; - @%p1 bra $L__BB11_2; - ld.param.u64 %rd5, [Unknown89_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown89_param_1]; - ld.param.u64 %rd7, [Unknown89_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB11_2: - ret; - -} - // .globl Unknown88 -.visible .entry Unknown88( - .param .u64 Unknown88_param_0, - .param .u64 Unknown88_param_1, - .param .u64 Unknown88_param_2, - .param .u64 Unknown88_param_3, - .param .u64 Unknown88_param_4, - .param .u64 Unknown88_param_5, - .param .u64 Unknown88_param_6, - .param .u64 Unknown88_param_7, - .param .u64 Unknown88_param_8, - .param .u64 Unknown88_param_9, - .param .u64 Unknown88_param_10, - .param .u64 Unknown88_param_11, - .param .u64 Unknown88_param_12, - .param .u64 Unknown88_param_13, - .param .u64 
Unknown88_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 255; - @%p1 bra $L__BB12_2; - ld.param.u64 %rd5, [Unknown88_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown88_param_1]; - ld.param.u64 %rd7, [Unknown88_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB12_2: - ret; - -} - // .globl Unknown87 -.visible .entry Unknown87( - .param .u64 Unknown87_param_0, - .param .u64 Unknown87_param_1, - .param .u64 Unknown87_param_2, - .param .u64 Unknown87_param_3, - .param .u64 Unknown87_param_4, - .param .u64 Unknown87_param_5, - .param .u64 Unknown87_param_6, - .param .u64 Unknown87_param_7, - .param .u64 Unknown87_param_8, - .param .u64 Unknown87_param_9, - .param .u64 Unknown87_param_10, - .param .u64 Unknown87_param_11, - .param .u64 Unknown87_param_12, - .param .u64 Unknown87_param_13, - .param .u64 Unknown87_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 255; - @%p1 bra $L__BB13_2; - ld.param.u64 %rd5, [Unknown87_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown87_param_1]; - ld.param.u64 %rd7, [Unknown87_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB13_2: - ret; - -} - // .globl Unknown86 -.visible .entry Unknown86( - .param .u64 Unknown86_param_0, - .param .u64 Unknown86_param_1, - .param .u64 Unknown86_param_2, - .param .u64 Unknown86_param_3, - .param .u64 Unknown86_param_4, - .param .u64 Unknown86_param_5, - .param .u64 Unknown86_param_6, - .param .u64 Unknown86_param_7, - .param .u64 Unknown86_param_8, - .param .u64 Unknown86_param_9, - .param .u64 Unknown86_param_10, - .param .u64 Unknown86_param_11, - .param .u64 Unknown86_param_12, - .param .u64 Unknown86_param_13, - .param .u64 Unknown86_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 255; - @%p1 bra $L__BB14_2; - ld.param.u64 %rd5, [Unknown86_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown86_param_1]; - ld.param.u64 %rd7, [Unknown86_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 
0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB14_2: - ret; - -} - // .globl Unknown85 -.visible .entry Unknown85( - .param .u64 Unknown85_param_0, - .param .u64 Unknown85_param_1, - .param .u64 Unknown85_param_2, - .param .u64 Unknown85_param_3, - .param .u64 Unknown85_param_4, - .param .u64 Unknown85_param_5, - .param .u64 Unknown85_param_6, - .param .u64 Unknown85_param_7, - .param .u64 Unknown85_param_8, - .param .u64 Unknown85_param_9, - .param .u64 Unknown85_param_10, - .param .u64 Unknown85_param_11, - .param .u64 Unknown85_param_12, - .param .u64 Unknown85_param_13, - .param .u64 Unknown85_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 255; - @%p1 bra $L__BB15_2; - ld.param.u64 %rd5, [Unknown85_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown85_param_1]; - ld.param.u64 %rd7, [Unknown85_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB15_2: - ret; - -} - // .globl Unknown84 -.visible .entry Unknown84( - .param .u64 Unknown84_param_0, - .param .u64 Unknown84_param_1, - .param .u64 Unknown84_param_2, - .param .u64 Unknown84_param_3, - .param .u64 Unknown84_param_4, - .param .u64 Unknown84_param_5, - .param .u64 Unknown84_param_6, - .param .u64 Unknown84_param_7, - .param .u64 Unknown84_param_8, - .param .u64 Unknown84_param_9, - .param .u64 Unknown84_param_10, - .param .u64 Unknown84_param_11, - .param .u64 Unknown84_param_12, - .param .u64 Unknown84_param_13, - .param .u64 Unknown84_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 255; - @%p1 bra $L__BB16_2; - ld.param.u64 %rd5, [Unknown84_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown84_param_1]; - ld.param.u64 %rd7, [Unknown84_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB16_2: - ret; - -} - // .globl Unknown83 -.visible .entry Unknown83( - .param .u64 Unknown83_param_0, - .param .u64 Unknown83_param_1, - .param .u64 Unknown83_param_2, - .param .u64 Unknown83_param_3, - .param .u64 Unknown83_param_4, - .param .u64 Unknown83_param_5, - .param .u64 Unknown83_param_6, - .param .u64 Unknown83_param_7, - .param .u64 Unknown83_param_8, - .param .u64 Unknown83_param_9, - .param .u64 Unknown83_param_10, - .param .u64 Unknown83_param_11, - .param .u64 Unknown83_param_12, - .param .u64 Unknown83_param_13, - .param .u64 Unknown83_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; + .reg .pred %p<3>; + .reg .b32 
%r<5>; .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 255; - @%p1 bra $L__BB17_2; - ld.param.u64 %rd5, [Unknown83_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown83_param_1]; - ld.param.u64 %rd7, [Unknown83_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB17_2: + .reg .b64 %rd<22>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd15, %r3; + mul.wide.s32 %rd16, %r2, %r1; + add.s64 %rd21, %rd16, %rd15; + setp.gt.s64 %p1, %rd21, 511; + @%p1 bra $L__BB0_3; + ld.param.u64 %rd12, [Unknown92_param_11]; + cvta.to.global.u64 %rd1, %rd12; + ld.param.u64 %rd13, [Unknown92_param_1]; + ld.param.u64 %rd14, [Unknown92_param_6]; + cvta.to.global.u64 %rd2, %rd14; + cvta.to.global.u64 %rd3, %rd13; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd5, %r2, %r4; + shl.b64 %rd20, %rd21, 2; + shl.b64 %rd7, %rd5, 2; +$L__BB0_2: + add.s64 %rd17, %rd2, %rd20; + ld.global.nc.f32 %f1, [%rd17]; + add.s64 %rd18, %rd3, %rd20; + ld.global.nc.f32 %f2, [%rd18]; + mul.rn.f32 %f3, %f1, 0f3F666666; + mul.rn.f32 %f4, %f2, 0f3DCCCCCD; + add.rn.f32 %f5, %f3, %f4; + add.s64 %rd19, %rd1, %rd20; + st.global.f32 [%rd19], %f5; + add.s64 %rd21, %rd21, %rd5; + add.s64 %rd20, %rd20, %rd7; + setp.lt.s64 %p2, %rd21, 512; + @%p2 bra $L__BB0_2; +$L__BB0_3: ret; } @@ -962,504 +92,44 @@ $L__BB17_2: .param .u64 Unknown82_param_14 ) { - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 255; - @%p1 bra $L__BB18_2; - ld.param.u64 %rd5, [Unknown82_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown82_param_1]; - ld.param.u64 %rd7, [Unknown82_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB18_2: - ret; - -} - // .globl Unknown81 -.visible .entry Unknown81( - .param .u64 Unknown81_param_0, - .param .u64 Unknown81_param_1, - .param .u64 Unknown81_param_2, - .param .u64 Unknown81_param_3, - .param .u64 Unknown81_param_4, - .param .u64 Unknown81_param_5, - .param .u64 Unknown81_param_6, - .param .u64 Unknown81_param_7, - .param .u64 Unknown81_param_8, - .param .u64 Unknown81_param_9, - .param .u64 Unknown81_param_10, - .param .u64 Unknown81_param_11, - .param .u64 Unknown81_param_12, - .param .u64 Unknown81_param_13, - .param .u64 Unknown81_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 255; - @%p1 bra 
$L__BB19_2; - ld.param.u64 %rd5, [Unknown81_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown81_param_1]; - ld.param.u64 %rd7, [Unknown81_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB19_2: - ret; - -} - // .globl Unknown80 -.visible .entry Unknown80( - .param .u64 Unknown80_param_0, - .param .u64 Unknown80_param_1, - .param .u64 Unknown80_param_2, - .param .u64 Unknown80_param_3, - .param .u64 Unknown80_param_4, - .param .u64 Unknown80_param_5, - .param .u64 Unknown80_param_6, - .param .u64 Unknown80_param_7, - .param .u64 Unknown80_param_8, - .param .u64 Unknown80_param_9, - .param .u64 Unknown80_param_10, - .param .u64 Unknown80_param_11, - .param .u64 Unknown80_param_12, - .param .u64 Unknown80_param_13, - .param .u64 Unknown80_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 127; - @%p1 bra $L__BB20_2; - ld.param.u64 %rd5, [Unknown80_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown80_param_1]; - ld.param.u64 %rd7, [Unknown80_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB20_2: - ret; - -} - // .globl Unknown79 -.visible .entry Unknown79( - .param .u64 Unknown79_param_0, - .param .u64 Unknown79_param_1, - .param .u64 Unknown79_param_2, - .param .u64 Unknown79_param_3, - .param .u64 Unknown79_param_4, - .param .u64 Unknown79_param_5, - .param .u64 Unknown79_param_6, - .param .u64 Unknown79_param_7, - .param .u64 Unknown79_param_8, - .param .u64 Unknown79_param_9, - .param .u64 Unknown79_param_10, - .param .u64 Unknown79_param_11, - .param .u64 Unknown79_param_12, - .param .u64 Unknown79_param_13, - .param .u64 Unknown79_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 127; - @%p1 bra $L__BB21_2; - ld.param.u64 %rd5, [Unknown79_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown79_param_1]; - ld.param.u64 %rd7, [Unknown79_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB21_2: - ret; - -} - // .globl Unknown78 -.visible .entry Unknown78( - .param .u64 Unknown78_param_0, - .param .u64 Unknown78_param_1, - .param .u64 Unknown78_param_2, - .param .u64 Unknown78_param_3, - 
.param .u64 Unknown78_param_4, - .param .u64 Unknown78_param_5, - .param .u64 Unknown78_param_6, - .param .u64 Unknown78_param_7, - .param .u64 Unknown78_param_8, - .param .u64 Unknown78_param_9, - .param .u64 Unknown78_param_10, - .param .u64 Unknown78_param_11, - .param .u64 Unknown78_param_12, - .param .u64 Unknown78_param_13, - .param .u64 Unknown78_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 127; - @%p1 bra $L__BB22_2; - ld.param.u64 %rd5, [Unknown78_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown78_param_1]; - ld.param.u64 %rd7, [Unknown78_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB22_2: - ret; - -} - // .globl Unknown77 -.visible .entry Unknown77( - .param .u64 Unknown77_param_0, - .param .u64 Unknown77_param_1, - .param .u64 Unknown77_param_2, - .param .u64 Unknown77_param_3, - .param .u64 Unknown77_param_4, - .param .u64 Unknown77_param_5, - .param .u64 Unknown77_param_6, - .param .u64 Unknown77_param_7, - .param .u64 Unknown77_param_8, - .param .u64 Unknown77_param_9, - .param .u64 Unknown77_param_10, - .param .u64 Unknown77_param_11, - .param .u64 Unknown77_param_12, - .param .u64 Unknown77_param_13, - .param .u64 Unknown77_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 127; - @%p1 bra $L__BB23_2; - ld.param.u64 %rd5, [Unknown77_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown77_param_1]; - ld.param.u64 %rd7, [Unknown77_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB23_2: - ret; - -} - // .globl Unknown76 -.visible .entry Unknown76( - .param .u64 Unknown76_param_0, - .param .u64 Unknown76_param_1, - .param .u64 Unknown76_param_2, - .param .u64 Unknown76_param_3, - .param .u64 Unknown76_param_4, - .param .u64 Unknown76_param_5, - .param .u64 Unknown76_param_6, - .param .u64 Unknown76_param_7, - .param .u64 Unknown76_param_8, - .param .u64 Unknown76_param_9, - .param .u64 Unknown76_param_10, - .param .u64 Unknown76_param_11, - .param .u64 Unknown76_param_12, - .param .u64 Unknown76_param_13, - .param .u64 Unknown76_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 127; - @%p1 bra $L__BB24_2; - ld.param.u64 %rd5, [Unknown76_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 
%rd6, [Unknown76_param_1]; - ld.param.u64 %rd7, [Unknown76_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB24_2: - ret; - -} - // .globl Unknown75 -.visible .entry Unknown75( - .param .u64 Unknown75_param_0, - .param .u64 Unknown75_param_1, - .param .u64 Unknown75_param_2, - .param .u64 Unknown75_param_3, - .param .u64 Unknown75_param_4, - .param .u64 Unknown75_param_5, - .param .u64 Unknown75_param_6, - .param .u64 Unknown75_param_7, - .param .u64 Unknown75_param_8, - .param .u64 Unknown75_param_9, - .param .u64 Unknown75_param_10, - .param .u64 Unknown75_param_11, - .param .u64 Unknown75_param_12, - .param .u64 Unknown75_param_13, - .param .u64 Unknown75_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 127; - @%p1 bra $L__BB25_2; - ld.param.u64 %rd5, [Unknown75_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown75_param_1]; - ld.param.u64 %rd7, [Unknown75_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB25_2: - ret; - -} - // .globl Unknown74 -.visible .entry Unknown74( - .param .u64 Unknown74_param_0, - .param .u64 Unknown74_param_1, - .param .u64 Unknown74_param_2, - .param .u64 Unknown74_param_3, - .param .u64 Unknown74_param_4, - .param .u64 Unknown74_param_5, - .param .u64 Unknown74_param_6, - .param .u64 Unknown74_param_7, - .param .u64 Unknown74_param_8, - .param .u64 Unknown74_param_9, - .param .u64 Unknown74_param_10, - .param .u64 Unknown74_param_11, - .param .u64 Unknown74_param_12, - .param .u64 Unknown74_param_13, - .param .u64 Unknown74_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 127; - @%p1 bra $L__BB26_2; - ld.param.u64 %rd5, [Unknown74_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown74_param_1]; - ld.param.u64 %rd7, [Unknown74_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB26_2: - ret; - -} - // .globl Unknown73 -.visible .entry Unknown73( - .param .u64 Unknown73_param_0, - .param .u64 Unknown73_param_1, - .param .u64 Unknown73_param_2, - .param .u64 Unknown73_param_3, - .param .u64 Unknown73_param_4, - .param .u64 Unknown73_param_5, - .param .u64 Unknown73_param_6, - .param 
.u64 Unknown73_param_7, - .param .u64 Unknown73_param_8, - .param .u64 Unknown73_param_9, - .param .u64 Unknown73_param_10, - .param .u64 Unknown73_param_11, - .param .u64 Unknown73_param_12, - .param .u64 Unknown73_param_13, - .param .u64 Unknown73_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; + .reg .pred %p<3>; + .reg .b32 %r<5>; .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 127; - @%p1 bra $L__BB27_2; - ld.param.u64 %rd5, [Unknown73_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown73_param_1]; - ld.param.u64 %rd7, [Unknown73_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB27_2: + .reg .b64 %rd<22>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd15, %r3; + mul.wide.s32 %rd16, %r2, %r1; + add.s64 %rd21, %rd16, %rd15; + setp.gt.s64 %p1, %rd21, 255; + @%p1 bra $L__BB1_3; + ld.param.u64 %rd12, [Unknown82_param_11]; + cvta.to.global.u64 %rd1, %rd12; + ld.param.u64 %rd13, [Unknown82_param_1]; + ld.param.u64 %rd14, [Unknown82_param_6]; + cvta.to.global.u64 %rd2, %rd14; + cvta.to.global.u64 %rd3, %rd13; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd5, %r2, %r4; + shl.b64 %rd20, %rd21, 2; + shl.b64 %rd7, %rd5, 2; +$L__BB1_2: + add.s64 %rd17, %rd2, %rd20; + ld.global.nc.f32 %f1, [%rd17]; + add.s64 %rd18, %rd3, %rd20; + ld.global.nc.f32 %f2, [%rd18]; + mul.rn.f32 %f3, %f1, 0f3F666666; + mul.rn.f32 %f4, %f2, 0f3DCCCCCD; + add.rn.f32 %f5, %f3, %f4; + add.s64 %rd19, %rd1, %rd20; + st.global.f32 [%rd19], %f5; + add.s64 %rd21, %rd21, %rd5; + add.s64 %rd20, %rd20, %rd7; + setp.lt.s64 %p2, %rd21, 256; + @%p2 bra $L__BB1_2; +$L__BB1_3: ret; } @@ -1482,504 +152,44 @@ $L__BB27_2: .param .u64 Unknown72_param_14 ) { - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 127; - @%p1 bra $L__BB28_2; - ld.param.u64 %rd5, [Unknown72_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown72_param_1]; - ld.param.u64 %rd7, [Unknown72_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB28_2: - ret; - -} - // .globl Unknown71 -.visible .entry Unknown71( - .param .u64 Unknown71_param_0, - .param .u64 Unknown71_param_1, - .param .u64 Unknown71_param_2, - .param .u64 Unknown71_param_3, - .param .u64 Unknown71_param_4, - .param .u64 Unknown71_param_5, - .param .u64 Unknown71_param_6, - .param .u64 Unknown71_param_7, - .param .u64 Unknown71_param_8, - .param .u64 Unknown71_param_9, - .param .u64 Unknown71_param_10, - .param .u64 Unknown71_param_11, - .param .u64 Unknown71_param_12, - .param .u64 
Unknown71_param_13, - .param .u64 Unknown71_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 127; - @%p1 bra $L__BB29_2; - ld.param.u64 %rd5, [Unknown71_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown71_param_1]; - ld.param.u64 %rd7, [Unknown71_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB29_2: - ret; - -} - // .globl Unknown70 -.visible .entry Unknown70( - .param .u64 Unknown70_param_0, - .param .u64 Unknown70_param_1, - .param .u64 Unknown70_param_2, - .param .u64 Unknown70_param_3, - .param .u64 Unknown70_param_4, - .param .u64 Unknown70_param_5, - .param .u64 Unknown70_param_6, - .param .u64 Unknown70_param_7, - .param .u64 Unknown70_param_8, - .param .u64 Unknown70_param_9, - .param .u64 Unknown70_param_10, - .param .u64 Unknown70_param_11, - .param .u64 Unknown70_param_12, - .param .u64 Unknown70_param_13, - .param .u64 Unknown70_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 63; - @%p1 bra $L__BB30_2; - ld.param.u64 %rd5, [Unknown70_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown70_param_1]; - ld.param.u64 %rd7, [Unknown70_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB30_2: - ret; - -} - // .globl Unknown69 -.visible .entry Unknown69( - .param .u64 Unknown69_param_0, - .param .u64 Unknown69_param_1, - .param .u64 Unknown69_param_2, - .param .u64 Unknown69_param_3, - .param .u64 Unknown69_param_4, - .param .u64 Unknown69_param_5, - .param .u64 Unknown69_param_6, - .param .u64 Unknown69_param_7, - .param .u64 Unknown69_param_8, - .param .u64 Unknown69_param_9, - .param .u64 Unknown69_param_10, - .param .u64 Unknown69_param_11, - .param .u64 Unknown69_param_12, - .param .u64 Unknown69_param_13, - .param .u64 Unknown69_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 63; - @%p1 bra $L__BB31_2; - ld.param.u64 %rd5, [Unknown69_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown69_param_1]; - ld.param.u64 %rd7, [Unknown69_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; 
- mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB31_2: - ret; - -} - // .globl Unknown68 -.visible .entry Unknown68( - .param .u64 Unknown68_param_0, - .param .u64 Unknown68_param_1, - .param .u64 Unknown68_param_2, - .param .u64 Unknown68_param_3, - .param .u64 Unknown68_param_4, - .param .u64 Unknown68_param_5, - .param .u64 Unknown68_param_6, - .param .u64 Unknown68_param_7, - .param .u64 Unknown68_param_8, - .param .u64 Unknown68_param_9, - .param .u64 Unknown68_param_10, - .param .u64 Unknown68_param_11, - .param .u64 Unknown68_param_12, - .param .u64 Unknown68_param_13, - .param .u64 Unknown68_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 63; - @%p1 bra $L__BB32_2; - ld.param.u64 %rd5, [Unknown68_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown68_param_1]; - ld.param.u64 %rd7, [Unknown68_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB32_2: - ret; - -} - // .globl Unknown67 -.visible .entry Unknown67( - .param .u64 Unknown67_param_0, - .param .u64 Unknown67_param_1, - .param .u64 Unknown67_param_2, - .param .u64 Unknown67_param_3, - .param .u64 Unknown67_param_4, - .param .u64 Unknown67_param_5, - .param .u64 Unknown67_param_6, - .param .u64 Unknown67_param_7, - .param .u64 Unknown67_param_8, - .param .u64 Unknown67_param_9, - .param .u64 Unknown67_param_10, - .param .u64 Unknown67_param_11, - .param .u64 Unknown67_param_12, - .param .u64 Unknown67_param_13, - .param .u64 Unknown67_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 63; - @%p1 bra $L__BB33_2; - ld.param.u64 %rd5, [Unknown67_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown67_param_1]; - ld.param.u64 %rd7, [Unknown67_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB33_2: - ret; - -} - // .globl Unknown66 -.visible .entry Unknown66( - .param .u64 Unknown66_param_0, - .param .u64 Unknown66_param_1, - .param .u64 Unknown66_param_2, - .param .u64 Unknown66_param_3, - .param .u64 Unknown66_param_4, - .param .u64 Unknown66_param_5, - .param .u64 Unknown66_param_6, - .param .u64 Unknown66_param_7, - .param .u64 Unknown66_param_8, - .param .u64 Unknown66_param_9, - .param .u64 Unknown66_param_10, - .param .u64 Unknown66_param_11, - .param .u64 Unknown66_param_12, - .param .u64 Unknown66_param_13, - .param .u64 Unknown66_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 
%f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 63; - @%p1 bra $L__BB34_2; - ld.param.u64 %rd5, [Unknown66_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown66_param_1]; - ld.param.u64 %rd7, [Unknown66_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB34_2: - ret; - -} - // .globl Unknown65 -.visible .entry Unknown65( - .param .u64 Unknown65_param_0, - .param .u64 Unknown65_param_1, - .param .u64 Unknown65_param_2, - .param .u64 Unknown65_param_3, - .param .u64 Unknown65_param_4, - .param .u64 Unknown65_param_5, - .param .u64 Unknown65_param_6, - .param .u64 Unknown65_param_7, - .param .u64 Unknown65_param_8, - .param .u64 Unknown65_param_9, - .param .u64 Unknown65_param_10, - .param .u64 Unknown65_param_11, - .param .u64 Unknown65_param_12, - .param .u64 Unknown65_param_13, - .param .u64 Unknown65_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 63; - @%p1 bra $L__BB35_2; - ld.param.u64 %rd5, [Unknown65_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown65_param_1]; - ld.param.u64 %rd7, [Unknown65_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB35_2: - ret; - -} - // .globl Unknown64 -.visible .entry Unknown64( - .param .u64 Unknown64_param_0, - .param .u64 Unknown64_param_1, - .param .u64 Unknown64_param_2, - .param .u64 Unknown64_param_3, - .param .u64 Unknown64_param_4, - .param .u64 Unknown64_param_5, - .param .u64 Unknown64_param_6, - .param .u64 Unknown64_param_7, - .param .u64 Unknown64_param_8, - .param .u64 Unknown64_param_9, - .param .u64 Unknown64_param_10, - .param .u64 Unknown64_param_11, - .param .u64 Unknown64_param_12, - .param .u64 Unknown64_param_13, - .param .u64 Unknown64_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 63; - @%p1 bra $L__BB36_2; - ld.param.u64 %rd5, [Unknown64_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown64_param_1]; - ld.param.u64 %rd7, [Unknown64_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 
[%rd13], %f5; -$L__BB36_2: - ret; - -} - // .globl Unknown63 -.visible .entry Unknown63( - .param .u64 Unknown63_param_0, - .param .u64 Unknown63_param_1, - .param .u64 Unknown63_param_2, - .param .u64 Unknown63_param_3, - .param .u64 Unknown63_param_4, - .param .u64 Unknown63_param_5, - .param .u64 Unknown63_param_6, - .param .u64 Unknown63_param_7, - .param .u64 Unknown63_param_8, - .param .u64 Unknown63_param_9, - .param .u64 Unknown63_param_10, - .param .u64 Unknown63_param_11, - .param .u64 Unknown63_param_12, - .param .u64 Unknown63_param_13, - .param .u64 Unknown63_param_14 -) -{ - .reg .pred %p<2>; - .reg .b32 %r<4>; + .reg .pred %p<3>; + .reg .b32 %r<5>; .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 63; - @%p1 bra $L__BB37_2; - ld.param.u64 %rd5, [Unknown63_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown63_param_1]; - ld.param.u64 %rd7, [Unknown63_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB37_2: + .reg .b64 %rd<22>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd15, %r3; + mul.wide.s32 %rd16, %r2, %r1; + add.s64 %rd21, %rd16, %rd15; + setp.gt.s64 %p1, %rd21, 127; + @%p1 bra $L__BB2_3; + ld.param.u64 %rd12, [Unknown72_param_11]; + cvta.to.global.u64 %rd1, %rd12; + ld.param.u64 %rd13, [Unknown72_param_1]; + ld.param.u64 %rd14, [Unknown72_param_6]; + cvta.to.global.u64 %rd2, %rd14; + cvta.to.global.u64 %rd3, %rd13; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd5, %r2, %r4; + shl.b64 %rd20, %rd21, 2; + shl.b64 %rd7, %rd5, 2; +$L__BB2_2: + add.s64 %rd17, %rd2, %rd20; + ld.global.nc.f32 %f1, [%rd17]; + add.s64 %rd18, %rd3, %rd20; + ld.global.nc.f32 %f2, [%rd18]; + mul.rn.f32 %f3, %f1, 0f3F666666; + mul.rn.f32 %f4, %f2, 0f3DCCCCCD; + add.rn.f32 %f5, %f3, %f4; + add.s64 %rd19, %rd1, %rd20; + st.global.f32 [%rd19], %f5; + add.s64 %rd21, %rd21, %rd5; + add.s64 %rd20, %rd20, %rd7; + setp.lt.s64 %p2, %rd21, 128; + @%p2 bra $L__BB2_2; +$L__BB2_3: ret; } @@ -2002,36 +212,44 @@ $L__BB37_2: .param .u64 Unknown62_param_14 ) { - .reg .pred %p<2>; - .reg .b32 %r<4>; + .reg .pred %p<3>; + .reg .b32 %r<5>; .reg .f32 %f<6>; - .reg .b64 %rd<14>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 63; - @%p1 bra $L__BB38_2; - ld.param.u64 %rd5, [Unknown62_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown62_param_1]; - ld.param.u64 %rd7, [Unknown62_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB38_2: + .reg .b64 %rd<22>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd15, %r3; + mul.wide.s32 %rd16, 
%r2, %r1; + add.s64 %rd21, %rd16, %rd15; + setp.gt.s64 %p1, %rd21, 63; + @%p1 bra $L__BB3_3; + ld.param.u64 %rd12, [Unknown62_param_11]; + cvta.to.global.u64 %rd1, %rd12; + ld.param.u64 %rd13, [Unknown62_param_1]; + ld.param.u64 %rd14, [Unknown62_param_6]; + cvta.to.global.u64 %rd2, %rd14; + cvta.to.global.u64 %rd3, %rd13; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd5, %r2, %r4; + shl.b64 %rd20, %rd21, 2; + shl.b64 %rd7, %rd5, 2; +$L__BB3_2: + add.s64 %rd17, %rd2, %rd20; + ld.global.nc.f32 %f1, [%rd17]; + add.s64 %rd18, %rd3, %rd20; + ld.global.nc.f32 %f2, [%rd18]; + mul.rn.f32 %f3, %f1, 0f3F666666; + mul.rn.f32 %f4, %f2, 0f3DCCCCCD; + add.rn.f32 %f5, %f3, %f4; + add.s64 %rd19, %rd1, %rd20; + st.global.f32 [%rd19], %f5; + add.s64 %rd21, %rd21, %rd5; + add.s64 %rd20, %rd20, %rd7; + setp.lt.s64 %p2, %rd21, 64; + @%p2 bra $L__BB3_2; +$L__BB3_3: ret; } @@ -2051,39 +269,54 @@ $L__BB38_2: .param .u64 Unknown61_param_11, .param .u64 Unknown61_param_12, .param .u64 Unknown61_param_13, - .param .u64 Unknown61_param_14 + .param .u64 Unknown61_param_14, + .param .u64 Unknown61_param_15, + .param .u64 Unknown61_param_16, + .param .u64 Unknown61_param_17, + .param .u64 Unknown61_param_18 ) { - .reg .pred %p<2>; - .reg .b32 %r<4>; - .reg .f32 %f<6>; - .reg .b64 %rd<14>; + .reg .pred %p<3>; + .reg .b16 %rs<4>; + .reg .b32 %r<5>; + .reg .f32 %f<2>; + .reg .b64 %rd<27>; mov.u32 %r1, %ctaid.x; mov.u32 %r2, %ntid.x; mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 63; - @%p1 bra $L__BB39_2; - ld.param.u64 %rd5, [Unknown61_param_11]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown61_param_1]; - ld.param.u64 %rd7, [Unknown61_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 2; - add.s64 %rd11, %rd3, %rd10; - ld.global.f32 %f1, [%rd11]; - add.s64 %rd12, %rd2, %rd10; - ld.global.f32 %f2, [%rd12]; - mul.rn.f32 %f3, %f2, 0f3F666666; - mul.rn.f32 %f4, %f1, 0f3DCCCCCD; - add.rn.f32 %f5, %f4, %f3; - add.s64 %rd13, %rd1, %rd10; - st.global.f32 [%rd13], %f5; -$L__BB39_2: + cvt.s64.s32 %rd19, %r3; + mul.wide.s32 %rd20, %r2, %r1; + add.s64 %rd26, %rd20, %rd19; + setp.gt.s64 %p1, %rd26, 999; + @%p1 bra $L__BB4_3; + ld.param.u64 %rd16, [Unknown61_param_13]; + cvta.to.global.u64 %rd1, %rd16; + ld.param.u64 %rd17, [Unknown61_param_1]; + ld.param.u64 %rd18, [Unknown61_param_6]; + cvta.to.global.u64 %rd2, %rd18; + cvta.to.global.u64 %rd3, %rd17; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd5, %r2, %r4; + shl.b64 %rd21, %rd26, 2; + add.s64 %rd25, %rd3, %rd21; + shl.b64 %rd7, %rd5, 2; + shl.b64 %rd24, %rd26, 1; + shl.b64 %rd9, %rd5, 1; +$L__BB4_2: + ld.global.nc.f32 %f1, [%rd25]; + add.s64 %rd22, %rd2, %rd24; + ld.global.nc.u16 %rs1, [%rd22]; + cvt.rn.f16.f32 %rs2, %f1; + add.rn.f16 %rs3, %rs1, %rs2; + add.s64 %rd23, %rd1, %rd24; + st.global.b16 [%rd23], %rs3; + add.s64 %rd26, %rd26, %rd5; + add.s64 %rd25, %rd25, %rd7; + add.s64 %rd24, %rd24, %rd9; + setp.lt.s64 %p2, %rd26, 1000; + @%p2 bra $L__BB4_2; +$L__BB4_3: ret; } @@ -2102,45 +335,45 @@ $L__BB39_2: .param .u64 Unknown60_param_10, .param .u64 Unknown60_param_11, .param .u64 Unknown60_param_12, - .param .u64 Unknown60_param_13, - .param .u64 Unknown60_param_14, - .param .u64 Unknown60_param_15, - .param .u64 Unknown60_param_16, - .param .u64 Unknown60_param_17, - .param .u64 Unknown60_param_18 + .param .u64 Unknown60_param_13 ) { - .reg .pred %p<2>; - .reg .b16 %h<4>; - .reg .b32 %r<4>; + .reg .pred %p<3>; + .reg .b16 %rs<2>; + .reg 
.b32 %r<5>; .reg .f32 %f<2>; - .reg .b64 %rd<15>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 999; - @%p1 bra $L__BB40_2; - ld.param.u64 %rd5, [Unknown60_param_13]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown60_param_1]; - ld.param.u64 %rd7, [Unknown60_param_6]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - shl.b64 %rd10, %rd4, 1; - add.s64 %rd11, %rd2, %rd10; - ld.global.b16 %h1, [%rd11]; - shl.b64 %rd12, %rd4, 2; - add.s64 %rd13, %rd3, %rd12; - ld.global.f32 %f1, [%rd13]; - cvt.rn.f16.f32 %h2, %f1; - add.rn.f16 %h3, %h1, %h2; - add.s64 %rd14, %rd1, %rd10; - st.global.b16 [%rd14], %h3; -$L__BB40_2: + .reg .b64 %rd<24>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 511999; + @%p1 bra $L__BB5_3; + ld.param.u64 %rd15, [Unknown60_param_8]; + cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, [Unknown60_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd19, %rd23, 2; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 2; + shl.b64 %rd20, %rd23, 1; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 1; +$L__BB5_2: + ld.global.nc.f32 %f1, [%rd22]; + cvt.rn.f16.f32 %rs1, %f1; + st.global.b16 [%rd21], %rs1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, %rd23, 512000; + @%p2 bra $L__BB5_2; +$L__BB5_3: ret; } @@ -2160,502 +393,41 @@ $L__BB40_2: .param .u64 Unknown59_param_11, .param .u64 Unknown59_param_12, .param .u64 Unknown59_param_13 -) -{ - .reg .pred %p<3>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; - .reg .f32 %f<2>; - .reg .b64 %rd<27>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 511999; - @%p1 bra $L__BB41_2; - ld.param.u64 %rd4, [Unknown59_param_8]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown59_param_1]; - cvta.to.global.u64 %rd2, %rd5; - shr.s64 %rd8, %rd3, 63; - shr.u64 %rd9, %rd8, 55; - add.s64 %rd10, %rd3, %rd9; - and.b64 %rd11, %rd10, -512; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 512; - selp.b64 %rd14, %rd13, %rd12, %p2; - xor.b64 %rd15, %rd8, %rd3; - shr.s64 %rd16, %rd15, 63; - shr.u64 %rd17, %rd16, 55; - add.s64 %rd18, %rd15, %rd17; - shr.u64 %rd19, %rd18, 9; - xor.b64 %rd20, %rd19, %rd8; - shl.b64 %rd21, %rd20, 9; - add.s64 %rd22, %rd21, %rd14; - shl.b64 %rd23, %rd22, 2; - add.s64 %rd24, %rd2, %rd23; - ld.global.f32 %f1, [%rd24]; - cvt.rn.f16.f32 %h1, %f1; - shl.b64 %rd25, %rd22, 1; - add.s64 %rd26, %rd1, %rd25; - st.global.b16 [%rd26], %h1; -$L__BB41_2: - ret; - -} - // .globl Unknown58 -.visible .entry Unknown58( - .param .u64 Unknown58_param_0, - .param .u64 Unknown58_param_1, - .param .u64 Unknown58_param_2, - .param .u64 Unknown58_param_3, - .param .u64 Unknown58_param_4, - .param .u64 Unknown58_param_5, - .param .u64 Unknown58_param_6, - .param .u64 Unknown58_param_7, - .param .u64 Unknown58_param_8, - .param .u64 Unknown58_param_9, - .param .u64 Unknown58_param_10, - .param .u64 Unknown58_param_11, - .param .u64 Unknown58_param_12, - .param .u64 Unknown58_param_13 -) -{ - .reg .pred %p<2>; - .reg .b16 %h<4>; - .reg .b32 %r<4>; - .reg .b64 
%rd<11>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd7, %r3; - mul.wide.s32 %rd8, %r2, %r1; - add.s64 %rd9, %rd8, %rd7; - setp.gt.s64 %p1, %rd9, 511; - @%p1 bra $L__BB42_2; - ld.param.u64 %rd3, [Unknown58_param_8]; - cvta.to.global.u64 %rd4, %rd3; - ld.param.u64 %rd5, [Unknown58_param_1]; - cvta.to.global.u64 %rd6, %rd5; - shl.b64 %rd10, %rd9, 1; - add.s64 %rd1, %rd6, %rd10; - add.s64 %rd2, %rd4, %rd10; - ld.global.b16 %h1, [%rd1]; - mov.b16 %h2, 0x2539; - mul.rn.f16 %h3, %h1, %h2; - st.global.b16 [%rd2], %h3; -$L__BB42_2: - ret; - -} - // .globl Unknown57 -.visible .entry Unknown57( - .param .u64 Unknown57_param_0, - .param .u64 Unknown57_param_1, - .param .u64 Unknown57_param_2, - .param .u64 Unknown57_param_3, - .param .u64 Unknown57_param_4, - .param .u64 Unknown57_param_5, - .param .u64 Unknown57_param_6, - .param .u64 Unknown57_param_7, - .param .u64 Unknown57_param_8, - .param .u64 Unknown57_param_9, - .param .u64 Unknown57_param_10, - .param .u64 Unknown57_param_11, - .param .u64 Unknown57_param_12, - .param .u64 Unknown57_param_13, - .param .u64 Unknown57_param_14, - .param .u64 Unknown57_param_15, - .param .u64 Unknown57_param_16, - .param .u64 Unknown57_param_17, - .param .u64 Unknown57_param_18, - .param .u64 Unknown57_param_19, - .param .u64 Unknown57_param_20, - .param .u64 Unknown57_param_21, - .param .u64 Unknown57_param_22, - .param .u64 Unknown57_param_23, - .param .u64 Unknown57_param_24, - .param .u64 Unknown57_param_25, - .param .u64 Unknown57_param_26, - .param .u64 Unknown57_param_27, - .param .u64 Unknown57_param_28, - .param .u64 Unknown57_param_29, - .param .u64 Unknown57_param_30, - .param .u64 Unknown57_param_31, - .param .u64 Unknown57_param_32 -) -{ - .reg .pred %p<4>; - .reg .b16 %h<5>; - .reg .b32 %r<4>; - .reg .f32 %f<3>; - .reg .b64 %rd<48>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 25087; - @%p1 bra $L__BB43_2; - ld.param.u64 %rd5, [Unknown57_param_23]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown57_param_1]; - ld.param.u64 %rd7, [Unknown57_param_12]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - mul.hi.s64 %rd10, %rd4, 5270498306774157605; - shr.u64 %rd11, %rd10, 63; - shr.s64 %rd12, %rd10, 1; - add.s64 %rd13, %rd12, %rd11; - mul.lo.s64 %rd14, %rd13, 7; - sub.s64 %rd15, %rd4, %rd14; - setp.lt.s64 %p2, %rd15, 0; - add.s64 %rd16, %rd15, 7; - selp.b64 %rd17, %rd16, %rd15, %p2; - shr.s64 %rd18, %rd4, 63; - xor.b64 %rd19, %rd18, %rd4; - mul.hi.s64 %rd20, %rd19, 5270498306774157605; - shr.u64 %rd21, %rd20, 63; - shr.s64 %rd22, %rd20, 1; - add.s64 %rd23, %rd22, %rd21; - xor.b64 %rd24, %rd23, %rd18; - mul.hi.s64 %rd25, %rd24, 5270498306774157605; - shr.u64 %rd26, %rd25, 63; - shr.s64 %rd27, %rd25, 1; - add.s64 %rd28, %rd27, %rd26; - mul.lo.s64 %rd29, %rd28, 7; - sub.s64 %rd30, %rd24, %rd29; - setp.lt.s64 %p3, %rd30, 0; - add.s64 %rd31, %rd30, 7; - selp.b64 %rd32, %rd31, %rd30, %p3; - shr.s64 %rd33, %rd24, 63; - xor.b64 %rd34, %rd33, %rd24; - mul.hi.s64 %rd35, %rd34, 5270498306774157605; - shr.u64 %rd36, %rd35, 63; - shr.u64 %rd37, %rd35, 1; - add.s64 %rd38, %rd37, %rd36; - xor.b64 %rd39, %rd38, %rd33; - mul.lo.s64 %rd40, %rd39, 49; - mul.lo.s64 %rd41, %rd32, 7; - add.s64 %rd42, %rd41, %rd17; - add.s64 %rd43, %rd42, %rd40; - shl.b64 %rd44, %rd43, 1; - add.s64 %rd45, %rd3, %rd44; - ld.global.b16 %h1, [%rd45]; - add.s64 %rd46, %rd2, %rd44; - 
ld.global.b16 %h2, [%rd46]; - add.rn.f16 %h3, %h1, %h2; - cvt.f32.f16 %f1, %h3; - max.f32 %f2, %f1, 0f00000000; - cvt.rn.f16.f32 %h4, %f2; - add.s64 %rd47, %rd1, %rd44; - st.global.b16 [%rd47], %h4; -$L__BB43_2: - ret; - -} - // .globl Unknown55 -.visible .entry Unknown55( - .param .u64 Unknown55_param_0, - .param .u64 Unknown55_param_1, - .param .u64 Unknown55_param_2, - .param .u64 Unknown55_param_3, - .param .u64 Unknown55_param_4, - .param .u64 Unknown55_param_5, - .param .u64 Unknown55_param_6, - .param .u64 Unknown55_param_7, - .param .u64 Unknown55_param_8, - .param .u64 Unknown55_param_9, - .param .u64 Unknown55_param_10, - .param .u64 Unknown55_param_11, - .param .u64 Unknown55_param_12, - .param .u64 Unknown55_param_13, - .param .u64 Unknown55_param_14, - .param .u64 Unknown55_param_15, - .param .u64 Unknown55_param_16, - .param .u64 Unknown55_param_17, - .param .u64 Unknown55_param_18, - .param .u64 Unknown55_param_19, - .param .u64 Unknown55_param_20, - .param .u64 Unknown55_param_21 -) -{ - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; - .reg .f32 %f<2>; - .reg .b64 %rd<57>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 2359295; - @%p1 bra $L__BB44_2; - ld.param.u64 %rd4, [Unknown55_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown55_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 55; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -512; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 512; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 55; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 9; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 4608; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 2; - add.s64 %rd54, %rd2, %rd53; - ld.global.f32 %f1, [%rd54]; - cvt.rn.f16.f32 %h1, %f1; - shl.b64 %rd55, %rd52, 1; - add.s64 %rd56, %rd1, %rd55; - st.global.b16 [%rd56], %h1; -$L__BB44_2: - ret; - -} - // .globl Unknown54 -.visible .entry Unknown54( - .param .u64 Unknown54_param_0, - .param .u64 Unknown54_param_1, - .param .u64 Unknown54_param_2, - .param .u64 Unknown54_param_3, - .param .u64 Unknown54_param_4, - .param .u64 Unknown54_param_5, - .param .u64 Unknown54_param_6, - .param .u64 Unknown54_param_7, - .param .u64 
Unknown54_param_8, - .param .u64 Unknown54_param_9, - .param .u64 Unknown54_param_10, - .param .u64 Unknown54_param_11, - .param .u64 Unknown54_param_12, - .param .u64 Unknown54_param_13, - .param .u64 Unknown54_param_14, - .param .u64 Unknown54_param_15, - .param .u64 Unknown54_param_16, - .param .u64 Unknown54_param_17, - .param .u64 Unknown54_param_18, - .param .u64 Unknown54_param_19, - .param .u64 Unknown54_param_20, - .param .u64 Unknown54_param_21 -) -{ - .reg .pred %p<4>; - .reg .b16 %h<3>; - .reg .b32 %r<4>; - .reg .f32 %f<3>; - .reg .b64 %rd<45>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 25087; - @%p1 bra $L__BB45_2; - ld.param.u64 %rd4, [Unknown54_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown54_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 5270498306774157605; - shr.u64 %rd9, %rd8, 63; - shr.s64 %rd10, %rd8, 1; - add.s64 %rd11, %rd10, %rd9; - mul.lo.s64 %rd12, %rd11, 7; - sub.s64 %rd13, %rd3, %rd12; - setp.lt.s64 %p2, %rd13, 0; - add.s64 %rd14, %rd13, 7; - selp.b64 %rd15, %rd14, %rd13, %p2; - shr.s64 %rd16, %rd3, 63; - xor.b64 %rd17, %rd16, %rd3; - mul.hi.s64 %rd18, %rd17, 5270498306774157605; - shr.u64 %rd19, %rd18, 63; - shr.s64 %rd20, %rd18, 1; - add.s64 %rd21, %rd20, %rd19; - xor.b64 %rd22, %rd21, %rd16; - mul.hi.s64 %rd23, %rd22, 5270498306774157605; - shr.u64 %rd24, %rd23, 63; - shr.s64 %rd25, %rd23, 1; - add.s64 %rd26, %rd25, %rd24; - mul.lo.s64 %rd27, %rd26, 7; - sub.s64 %rd28, %rd22, %rd27; - setp.lt.s64 %p3, %rd28, 0; - add.s64 %rd29, %rd28, 7; - selp.b64 %rd30, %rd29, %rd28, %p3; - shr.s64 %rd31, %rd22, 63; - xor.b64 %rd32, %rd31, %rd22; - mul.hi.s64 %rd33, %rd32, 5270498306774157605; - shr.u64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd33, 1; - add.s64 %rd36, %rd35, %rd34; - xor.b64 %rd37, %rd36, %rd31; - mul.lo.s64 %rd38, %rd37, 49; - mul.lo.s64 %rd39, %rd30, 7; - add.s64 %rd40, %rd39, %rd15; - add.s64 %rd41, %rd40, %rd38; - shl.b64 %rd42, %rd41, 1; - add.s64 %rd43, %rd2, %rd42; - ld.global.b16 %h1, [%rd43]; - cvt.f32.f16 %f1, %h1; - max.f32 %f2, %f1, 0f00000000; - cvt.rn.f16.f32 %h2, %f2; - add.s64 %rd44, %rd1, %rd42; - st.global.b16 [%rd44], %h2; -$L__BB45_2: - ret; - -} - // .globl Unknown52 -.visible .entry Unknown52( - .param .u64 Unknown52_param_0, - .param .u64 Unknown52_param_1, - .param .u64 Unknown52_param_2, - .param .u64 Unknown52_param_3, - .param .u64 Unknown52_param_4, - .param .u64 Unknown52_param_5, - .param .u64 Unknown52_param_6, - .param .u64 Unknown52_param_7, - .param .u64 Unknown52_param_8, - .param .u64 Unknown52_param_9, - .param .u64 Unknown52_param_10, - .param .u64 Unknown52_param_11, - .param .u64 Unknown52_param_12, - .param .u64 Unknown52_param_13, - .param .u64 Unknown52_param_14, - .param .u64 Unknown52_param_15, - .param .u64 Unknown52_param_16, - .param .u64 Unknown52_param_17, - .param .u64 Unknown52_param_18, - .param .u64 Unknown52_param_19, - .param .u64 Unknown52_param_20, - .param .u64 Unknown52_param_21 -) -{ - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; - .reg .f32 %f<2>; - .reg .b64 %rd<57>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 2359295; - @%p1 bra $L__BB46_2; - ld.param.u64 %rd4, [Unknown52_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown52_param_1]; - 
cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 55; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -512; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 512; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 55; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 9; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 4608; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 2; - add.s64 %rd54, %rd2, %rd53; - ld.global.f32 %f1, [%rd54]; - cvt.rn.f16.f32 %h1, %f1; - shl.b64 %rd55, %rd52, 1; - add.s64 %rd56, %rd1, %rd55; - st.global.b16 [%rd56], %h1; -$L__BB46_2: +) +{ + .reg .pred %p<3>; + .reg .b16 %rs<4>; + .reg .b32 %r<5>; + .reg .b64 %rd<19>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd13, %r3; + mul.wide.s32 %rd14, %r2, %r1; + add.s64 %rd18, %rd14, %rd13; + setp.gt.s64 %p1, %rd18, 511; + @%p1 bra $L__BB6_3; + ld.param.u64 %rd11, [Unknown59_param_8]; + cvta.to.global.u64 %rd1, %rd11; + ld.param.u64 %rd12, [Unknown59_param_1]; + cvta.to.global.u64 %rd2, %rd12; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd17, %rd18, 1; + shl.b64 %rd6, %rd4, 1; +$L__BB6_2: + add.s64 %rd15, %rd2, %rd17; + ld.global.nc.u16 %rs1, [%rd15]; + mov.b16 %rs2, 0x2539; + mul.rn.f16 %rs3, %rs1, %rs2; + add.s64 %rd16, %rd1, %rd17; + st.global.b16 [%rd16], %rs3; + add.s64 %rd18, %rd18, %rd4; + add.s64 %rd17, %rd17, %rd6; + setp.lt.s64 %p2, %rd18, 512; + @%p2 bra $L__BB6_2; +$L__BB6_3: ret; } @@ -2696,74 +468,44 @@ $L__BB46_2: .param .u64 Unknown51_param_32 ) { - .reg .pred %p<4>; - .reg .b16 %h<5>; - .reg .b32 %r<4>; - .reg .f32 %f<3>; - .reg .b64 %rd<48>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 25087; - @%p1 bra $L__BB47_2; - ld.param.u64 %rd5, [Unknown51_param_23]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown51_param_1]; - ld.param.u64 %rd7, [Unknown51_param_12]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - mul.hi.s64 %rd10, %rd4, 5270498306774157605; - shr.u64 %rd11, %rd10, 63; - shr.s64 %rd12, %rd10, 1; - add.s64 %rd13, %rd12, %rd11; - mul.lo.s64 %rd14, %rd13, 7; - sub.s64 %rd15, %rd4, %rd14; - setp.lt.s64 %p2, %rd15, 0; - add.s64 %rd16, %rd15, 7; - selp.b64 %rd17, %rd16, %rd15, %p2; - shr.s64 
%rd18, %rd4, 63; - xor.b64 %rd19, %rd18, %rd4; - mul.hi.s64 %rd20, %rd19, 5270498306774157605; - shr.u64 %rd21, %rd20, 63; - shr.s64 %rd22, %rd20, 1; - add.s64 %rd23, %rd22, %rd21; - xor.b64 %rd24, %rd23, %rd18; - mul.hi.s64 %rd25, %rd24, 5270498306774157605; - shr.u64 %rd26, %rd25, 63; - shr.s64 %rd27, %rd25, 1; - add.s64 %rd28, %rd27, %rd26; - mul.lo.s64 %rd29, %rd28, 7; - sub.s64 %rd30, %rd24, %rd29; - setp.lt.s64 %p3, %rd30, 0; - add.s64 %rd31, %rd30, 7; - selp.b64 %rd32, %rd31, %rd30, %p3; - shr.s64 %rd33, %rd24, 63; - xor.b64 %rd34, %rd33, %rd24; - mul.hi.s64 %rd35, %rd34, 5270498306774157605; - shr.u64 %rd36, %rd35, 63; - shr.u64 %rd37, %rd35, 1; - add.s64 %rd38, %rd37, %rd36; - xor.b64 %rd39, %rd38, %rd33; - mul.lo.s64 %rd40, %rd39, 49; - mul.lo.s64 %rd41, %rd32, 7; - add.s64 %rd42, %rd41, %rd17; - add.s64 %rd43, %rd42, %rd40; - shl.b64 %rd44, %rd43, 1; - add.s64 %rd45, %rd3, %rd44; - ld.global.b16 %h1, [%rd45]; - add.s64 %rd46, %rd2, %rd44; - ld.global.b16 %h2, [%rd46]; - add.rn.f16 %h3, %h1, %h2; - cvt.f32.f16 %f1, %h3; - max.f32 %f2, %f1, 0f00000000; - cvt.rn.f16.f32 %h4, %f2; - add.s64 %rd47, %rd1, %rd44; - st.global.b16 [%rd47], %h4; -$L__BB47_2: + .reg .pred %p<3>; + .reg .b16 %rs<6>; + .reg .b32 %r<5>; + .reg .b64 %rd<22>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd15, %r3; + mul.wide.s32 %rd16, %r2, %r1; + add.s64 %rd21, %rd16, %rd15; + setp.gt.s64 %p1, %rd21, 25087; + @%p1 bra $L__BB7_3; + ld.param.u64 %rd12, [Unknown51_param_23]; + cvta.to.global.u64 %rd1, %rd12; + ld.param.u64 %rd13, [Unknown51_param_1]; + ld.param.u64 %rd14, [Unknown51_param_12]; + cvta.to.global.u64 %rd2, %rd14; + cvta.to.global.u64 %rd3, %rd13; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd5, %r2, %r4; + shl.b64 %rd20, %rd21, 1; + shl.b64 %rd7, %rd5, 1; +$L__BB7_2: + add.s64 %rd17, %rd3, %rd20; + ld.global.nc.u16 %rs1, [%rd17]; + add.s64 %rd18, %rd2, %rd20; + ld.global.nc.u16 %rs2, [%rd18]; + add.rn.f16 %rs3, %rs1, %rs2; + mov.b16 %rs4, 0x0000; + max.NaN.f16 %rs5, %rs3, %rs4; + add.s64 %rd19, %rd1, %rd20; + st.global.b16 [%rd19], %rs5; + add.s64 %rd21, %rd21, %rd5; + add.s64 %rd20, %rd20, %rd7; + setp.lt.s64 %p2, %rd21, 25088; + @%p2 bra $L__BB7_2; +$L__BB7_3: ret; } @@ -2793,80 +535,42 @@ $L__BB47_2: .param .u64 Unknown49_param_21 ) { - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; + .reg .pred %p<3>; + .reg .b16 %rs<2>; + .reg .b32 %r<5>; .reg .f32 %f<2>; - .reg .b64 %rd<57>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 2359295; - @%p1 bra $L__BB48_2; - ld.param.u64 %rd4, [Unknown49_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown49_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 
%rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 55; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -512; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 512; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 55; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 9; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 4608; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 2; - add.s64 %rd54, %rd2, %rd53; - ld.global.f32 %f1, [%rd54]; - cvt.rn.f16.f32 %h1, %f1; - shl.b64 %rd55, %rd52, 1; - add.s64 %rd56, %rd1, %rd55; - st.global.b16 [%rd56], %h1; -$L__BB48_2: + .reg .b64 %rd<24>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 2359295; + @%p1 bra $L__BB8_3; + ld.param.u64 %rd15, [Unknown49_param_12]; + cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, [Unknown49_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd19, %rd23, 2; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 2; + shl.b64 %rd20, %rd23, 1; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 1; +$L__BB8_2: + ld.global.nc.f32 %f1, [%rd22]; + cvt.rn.f16.f32 %rs1, %f1; + st.global.b16 [%rd21], %rs1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, %rd23, 2359296; + @%p2 bra $L__BB8_2; +$L__BB8_3: ret; } @@ -2896,69 +600,39 @@ $L__BB48_2: .param .u64 Unknown48_param_21 ) { - .reg .pred %p<4>; - .reg .b16 %h<3>; - .reg .b32 %r<4>; - .reg .f32 %f<3>; - .reg .b64 %rd<45>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 25087; - @%p1 bra $L__BB49_2; - ld.param.u64 %rd4, [Unknown48_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown48_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 5270498306774157605; - shr.u64 %rd9, %rd8, 63; - shr.s64 %rd10, %rd8, 1; - add.s64 %rd11, %rd10, %rd9; - mul.lo.s64 %rd12, %rd11, 7; - sub.s64 %rd13, %rd3, %rd12; - setp.lt.s64 %p2, %rd13, 0; - add.s64 %rd14, %rd13, 7; - selp.b64 %rd15, %rd14, %rd13, %p2; - shr.s64 %rd16, %rd3, 63; - xor.b64 %rd17, %rd16, %rd3; - mul.hi.s64 %rd18, %rd17, 5270498306774157605; - shr.u64 %rd19, %rd18, 63; - shr.s64 %rd20, %rd18, 1; - add.s64 %rd21, %rd20, %rd19; - xor.b64 %rd22, %rd21, %rd16; - mul.hi.s64 %rd23, %rd22, 5270498306774157605; - shr.u64 %rd24, %rd23, 63; - shr.s64 %rd25, %rd23, 1; - add.s64 %rd26, %rd25, %rd24; - mul.lo.s64 %rd27, %rd26, 7; - sub.s64 %rd28, %rd22, %rd27; - setp.lt.s64 %p3, %rd28, 0; - add.s64 %rd29, %rd28, 7; - selp.b64 %rd30, %rd29, %rd28, %p3; - shr.s64 %rd31, %rd22, 63; - xor.b64 %rd32, %rd31, %rd22; - mul.hi.s64 %rd33, %rd32, 5270498306774157605; - shr.u64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd33, 1; - add.s64 %rd36, %rd35, %rd34; - xor.b64 %rd37, %rd36, %rd31; - mul.lo.s64 %rd38, %rd37, 49; - mul.lo.s64 %rd39, %rd30, 7; - add.s64 %rd40, %rd39, %rd15; - add.s64 %rd41, 
%rd40, %rd38; - shl.b64 %rd42, %rd41, 1; - add.s64 %rd43, %rd2, %rd42; - ld.global.b16 %h1, [%rd43]; - cvt.f32.f16 %f1, %h1; - max.f32 %f2, %f1, 0f00000000; - cvt.rn.f16.f32 %h2, %f2; - add.s64 %rd44, %rd1, %rd42; - st.global.b16 [%rd44], %h2; -$L__BB49_2: + .reg .pred %p<3>; + .reg .b16 %rs<4>; + .reg .b32 %r<5>; + .reg .b64 %rd<19>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd13, %r3; + mul.wide.s32 %rd14, %r2, %r1; + add.s64 %rd18, %rd14, %rd13; + setp.gt.s64 %p1, %rd18, 25087; + @%p1 bra $L__BB9_3; + ld.param.u64 %rd11, [Unknown48_param_12]; + cvta.to.global.u64 %rd1, %rd11; + ld.param.u64 %rd12, [Unknown48_param_1]; + cvta.to.global.u64 %rd2, %rd12; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd17, %rd18, 1; + shl.b64 %rd6, %rd4, 1; +$L__BB9_2: + add.s64 %rd15, %rd2, %rd17; + ld.global.nc.u16 %rs1, [%rd15]; + mov.b16 %rs2, 0x0000; + max.NaN.f16 %rs3, %rs1, %rs2; + add.s64 %rd16, %rd1, %rd17; + st.global.b16 [%rd16], %rs3; + add.s64 %rd18, %rd18, %rd4; + add.s64 %rd17, %rd17, %rd6; + setp.lt.s64 %p2, %rd18, 25088; + @%p2 bra $L__BB9_2; +$L__BB9_3: ret; } @@ -2988,80 +662,42 @@ $L__BB49_2: .param .u64 Unknown46_param_21 ) { - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; + .reg .pred %p<3>; + .reg .b16 %rs<2>; + .reg .b32 %r<5>; .reg .f32 %f<2>; - .reg .b64 %rd<57>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 1179647; - @%p1 bra $L__BB50_2; - ld.param.u64 %rd4, [Unknown46_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown46_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 56; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -256; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 256; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 56; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 8; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 2304; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 2; - add.s64 %rd54, %rd2, %rd53; - ld.global.f32 %f1, [%rd54]; - cvt.rn.f16.f32 %h1, %f1; - shl.b64 %rd55, %rd52, 1; - add.s64 %rd56, %rd1, %rd55; - st.global.b16 [%rd56], %h1; -$L__BB50_2: + .reg .b64 %rd<24>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, 
%tid.x; + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 1179647; + @%p1 bra $L__BB10_3; + ld.param.u64 %rd15, [Unknown46_param_12]; + cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, [Unknown46_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd19, %rd23, 2; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 2; + shl.b64 %rd20, %rd23, 1; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 1; +$L__BB10_2: + ld.global.nc.f32 %f1, [%rd22]; + cvt.rn.f16.f32 %rs1, %f1; + st.global.b16 [%rd21], %rs1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, %rd23, 1179648; + @%p2 bra $L__BB10_2; +$L__BB10_3: ret; } @@ -3092,453 +728,41 @@ $L__BB50_2: ) { .reg .pred %p<3>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; - .reg .f32 %f<2>; - .reg .b64 %rd<27>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 131071; - @%p1 bra $L__BB51_2; - ld.param.u64 %rd4, [Unknown44_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown44_param_1]; - cvta.to.global.u64 %rd2, %rd5; - shr.s64 %rd8, %rd3, 63; - shr.u64 %rd9, %rd8, 56; - add.s64 %rd10, %rd3, %rd9; - and.b64 %rd11, %rd10, -256; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 256; - selp.b64 %rd14, %rd13, %rd12, %p2; - xor.b64 %rd15, %rd8, %rd3; - shr.s64 %rd16, %rd15, 63; - shr.u64 %rd17, %rd16, 56; - add.s64 %rd18, %rd15, %rd17; - shr.u64 %rd19, %rd18, 8; - xor.b64 %rd20, %rd19, %rd8; - shl.b64 %rd21, %rd20, 8; - add.s64 %rd22, %rd21, %rd14; - shl.b64 %rd23, %rd22, 2; - add.s64 %rd24, %rd2, %rd23; - ld.global.f32 %f1, [%rd24]; - cvt.rn.f16.f32 %h1, %f1; - shl.b64 %rd25, %rd22, 1; - add.s64 %rd26, %rd1, %rd25; - st.global.b16 [%rd26], %h1; -$L__BB51_2: - ret; - -} - // .globl Unknown43 -.visible .entry Unknown43( - .param .u64 Unknown43_param_0, - .param .u64 Unknown43_param_1, - .param .u64 Unknown43_param_2, - .param .u64 Unknown43_param_3, - .param .u64 Unknown43_param_4, - .param .u64 Unknown43_param_5, - .param .u64 Unknown43_param_6, - .param .u64 Unknown43_param_7, - .param .u64 Unknown43_param_8, - .param .u64 Unknown43_param_9, - .param .u64 Unknown43_param_10, - .param .u64 Unknown43_param_11, - .param .u64 Unknown43_param_12, - .param .u64 Unknown43_param_13, - .param .u64 Unknown43_param_14, - .param .u64 Unknown43_param_15, - .param .u64 Unknown43_param_16, - .param .u64 Unknown43_param_17, - .param .u64 Unknown43_param_18, - .param .u64 Unknown43_param_19, - .param .u64 Unknown43_param_20, - .param .u64 Unknown43_param_21, - .param .u64 Unknown43_param_22, - .param .u64 Unknown43_param_23, - .param .u64 Unknown43_param_24, - .param .u64 Unknown43_param_25, - .param .u64 Unknown43_param_26, - .param .u64 Unknown43_param_27, - .param .u64 Unknown43_param_28, - .param .u64 Unknown43_param_29, - .param .u64 Unknown43_param_30, - .param .u64 Unknown43_param_31, - .param .u64 Unknown43_param_32 -) -{ - .reg .pred %p<4>; - .reg .b16 %h<5>; - .reg .b32 %r<4>; - .reg .f32 %f<3>; - .reg .b64 %rd<48>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 50175; - @%p1 bra $L__BB52_2; - ld.param.u64 %rd5, [Unknown43_param_23]; - cvta.to.global.u64 %rd1, %rd5; - 
ld.param.u64 %rd6, [Unknown43_param_1]; - ld.param.u64 %rd7, [Unknown43_param_12]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - mul.hi.s64 %rd10, %rd4, 5270498306774157605; - shr.u64 %rd11, %rd10, 63; - shr.s64 %rd12, %rd10, 2; - add.s64 %rd13, %rd12, %rd11; - mul.lo.s64 %rd14, %rd13, 14; - sub.s64 %rd15, %rd4, %rd14; - setp.lt.s64 %p2, %rd15, 0; - add.s64 %rd16, %rd15, 14; - selp.b64 %rd17, %rd16, %rd15, %p2; - shr.s64 %rd18, %rd4, 63; - xor.b64 %rd19, %rd18, %rd4; - mul.hi.s64 %rd20, %rd19, 5270498306774157605; - shr.u64 %rd21, %rd20, 63; - shr.s64 %rd22, %rd20, 2; - add.s64 %rd23, %rd22, %rd21; - xor.b64 %rd24, %rd23, %rd18; - mul.hi.s64 %rd25, %rd24, 5270498306774157605; - shr.u64 %rd26, %rd25, 63; - shr.s64 %rd27, %rd25, 2; - add.s64 %rd28, %rd27, %rd26; - mul.lo.s64 %rd29, %rd28, 14; - sub.s64 %rd30, %rd24, %rd29; - setp.lt.s64 %p3, %rd30, 0; - add.s64 %rd31, %rd30, 14; - selp.b64 %rd32, %rd31, %rd30, %p3; - shr.s64 %rd33, %rd24, 63; - xor.b64 %rd34, %rd33, %rd24; - mul.hi.s64 %rd35, %rd34, 5270498306774157605; - shr.u64 %rd36, %rd35, 63; - shr.s64 %rd37, %rd35, 2; - add.s64 %rd38, %rd37, %rd36; - xor.b64 %rd39, %rd38, %rd33; - mul.lo.s64 %rd40, %rd39, 196; - mul.lo.s64 %rd41, %rd32, 14; - add.s64 %rd42, %rd41, %rd17; - add.s64 %rd43, %rd42, %rd40; - shl.b64 %rd44, %rd43, 1; - add.s64 %rd45, %rd3, %rd44; - ld.global.b16 %h1, [%rd45]; - add.s64 %rd46, %rd2, %rd44; - ld.global.b16 %h2, [%rd46]; - add.rn.f16 %h3, %h1, %h2; - cvt.f32.f16 %f1, %h3; - max.f32 %f2, %f1, 0f00000000; - cvt.rn.f16.f32 %h4, %f2; - add.s64 %rd47, %rd1, %rd44; - st.global.b16 [%rd47], %h4; -$L__BB52_2: - ret; - -} - // .globl Unknown41 -.visible .entry Unknown41( - .param .u64 Unknown41_param_0, - .param .u64 Unknown41_param_1, - .param .u64 Unknown41_param_2, - .param .u64 Unknown41_param_3, - .param .u64 Unknown41_param_4, - .param .u64 Unknown41_param_5, - .param .u64 Unknown41_param_6, - .param .u64 Unknown41_param_7, - .param .u64 Unknown41_param_8, - .param .u64 Unknown41_param_9, - .param .u64 Unknown41_param_10, - .param .u64 Unknown41_param_11, - .param .u64 Unknown41_param_12, - .param .u64 Unknown41_param_13, - .param .u64 Unknown41_param_14, - .param .u64 Unknown41_param_15, - .param .u64 Unknown41_param_16, - .param .u64 Unknown41_param_17, - .param .u64 Unknown41_param_18, - .param .u64 Unknown41_param_19, - .param .u64 Unknown41_param_20, - .param .u64 Unknown41_param_21 -) -{ - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; - .reg .f32 %f<2>; - .reg .b64 %rd<57>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 589823; - @%p1 bra $L__BB53_2; - ld.param.u64 %rd4, [Unknown41_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown41_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 
%rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 56; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -256; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 256; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 56; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 8; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 2304; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 2; - add.s64 %rd54, %rd2, %rd53; - ld.global.f32 %f1, [%rd54]; - cvt.rn.f16.f32 %h1, %f1; - shl.b64 %rd55, %rd52, 1; - add.s64 %rd56, %rd1, %rd55; - st.global.b16 [%rd56], %h1; -$L__BB53_2: - ret; - -} - // .globl Unknown40 -.visible .entry Unknown40( - .param .u64 Unknown40_param_0, - .param .u64 Unknown40_param_1, - .param .u64 Unknown40_param_2, - .param .u64 Unknown40_param_3, - .param .u64 Unknown40_param_4, - .param .u64 Unknown40_param_5, - .param .u64 Unknown40_param_6, - .param .u64 Unknown40_param_7, - .param .u64 Unknown40_param_8, - .param .u64 Unknown40_param_9, - .param .u64 Unknown40_param_10, - .param .u64 Unknown40_param_11, - .param .u64 Unknown40_param_12, - .param .u64 Unknown40_param_13, - .param .u64 Unknown40_param_14, - .param .u64 Unknown40_param_15, - .param .u64 Unknown40_param_16, - .param .u64 Unknown40_param_17, - .param .u64 Unknown40_param_18, - .param .u64 Unknown40_param_19, - .param .u64 Unknown40_param_20, - .param .u64 Unknown40_param_21 -) -{ - .reg .pred %p<4>; - .reg .b16 %h<3>; - .reg .b32 %r<4>; - .reg .f32 %f<3>; - .reg .b64 %rd<45>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 50175; - @%p1 bra $L__BB54_2; - ld.param.u64 %rd4, [Unknown40_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown40_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 5270498306774157605; - shr.u64 %rd9, %rd8, 63; - shr.s64 %rd10, %rd8, 2; - add.s64 %rd11, %rd10, %rd9; - mul.lo.s64 %rd12, %rd11, 14; - sub.s64 %rd13, %rd3, %rd12; - setp.lt.s64 %p2, %rd13, 0; - add.s64 %rd14, %rd13, 14; - selp.b64 %rd15, %rd14, %rd13, %p2; - shr.s64 %rd16, %rd3, 63; - xor.b64 %rd17, %rd16, %rd3; - mul.hi.s64 %rd18, %rd17, 5270498306774157605; - shr.u64 %rd19, %rd18, 63; - shr.s64 %rd20, %rd18, 2; - add.s64 %rd21, %rd20, %rd19; - xor.b64 %rd22, %rd21, %rd16; - mul.hi.s64 %rd23, %rd22, 5270498306774157605; - shr.u64 %rd24, %rd23, 63; - shr.s64 %rd25, %rd23, 2; - add.s64 %rd26, %rd25, %rd24; - mul.lo.s64 %rd27, %rd26, 14; - sub.s64 %rd28, %rd22, %rd27; - setp.lt.s64 %p3, %rd28, 0; - add.s64 %rd29, %rd28, 14; - selp.b64 %rd30, %rd29, %rd28, %p3; - shr.s64 %rd31, %rd22, 63; - xor.b64 %rd32, %rd31, %rd22; - mul.hi.s64 %rd33, %rd32, 5270498306774157605; - shr.u64 %rd34, %rd33, 63; - shr.s64 %rd35, %rd33, 2; - add.s64 %rd36, %rd35, %rd34; - xor.b64 %rd37, %rd36, %rd31; - mul.lo.s64 %rd38, %rd37, 196; - mul.lo.s64 %rd39, %rd30, 14; - add.s64 %rd40, %rd39, %rd15; - add.s64 %rd41, %rd40, %rd38; - shl.b64 %rd42, %rd41, 1; - add.s64 %rd43, %rd2, %rd42; - ld.global.b16 
%h1, [%rd43]; - cvt.f32.f16 %f1, %h1; - max.f32 %f2, %f1, 0f00000000; - cvt.rn.f16.f32 %h2, %f2; - add.s64 %rd44, %rd1, %rd42; - st.global.b16 [%rd44], %h2; -$L__BB54_2: - ret; - -} - // .globl Unknown38 -.visible .entry Unknown38( - .param .u64 Unknown38_param_0, - .param .u64 Unknown38_param_1, - .param .u64 Unknown38_param_2, - .param .u64 Unknown38_param_3, - .param .u64 Unknown38_param_4, - .param .u64 Unknown38_param_5, - .param .u64 Unknown38_param_6, - .param .u64 Unknown38_param_7, - .param .u64 Unknown38_param_8, - .param .u64 Unknown38_param_9, - .param .u64 Unknown38_param_10, - .param .u64 Unknown38_param_11, - .param .u64 Unknown38_param_12, - .param .u64 Unknown38_param_13, - .param .u64 Unknown38_param_14, - .param .u64 Unknown38_param_15, - .param .u64 Unknown38_param_16, - .param .u64 Unknown38_param_17, - .param .u64 Unknown38_param_18, - .param .u64 Unknown38_param_19, - .param .u64 Unknown38_param_20, - .param .u64 Unknown38_param_21 -) -{ - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; + .reg .b16 %rs<2>; + .reg .b32 %r<5>; .reg .f32 %f<2>; - .reg .b64 %rd<57>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 589823; - @%p1 bra $L__BB55_2; - ld.param.u64 %rd4, [Unknown38_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown38_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 56; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -256; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 256; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 56; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 8; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 2304; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 2; - add.s64 %rd54, %rd2, %rd53; - ld.global.f32 %f1, [%rd54]; - cvt.rn.f16.f32 %h1, %f1; - shl.b64 %rd55, %rd52, 1; - add.s64 %rd56, %rd1, %rd55; - st.global.b16 [%rd56], %h1; -$L__BB55_2: + .reg .b64 %rd<24>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 131071; + @%p1 bra $L__BB11_3; + ld.param.u64 %rd15, [Unknown44_param_12]; + cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, 
[Unknown44_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd19, %rd23, 2; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 2; + shl.b64 %rd20, %rd23, 1; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 1; +$L__BB11_2: + ld.global.nc.f32 %f1, [%rd22]; + cvt.rn.f16.f32 %rs1, %f1; + st.global.b16 [%rd21], %rs1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, %rd23, 131072; + @%p2 bra $L__BB11_2; +$L__BB11_3: ret; } @@ -3579,74 +803,44 @@ $L__BB55_2: .param .u64 Unknown37_param_32 ) { - .reg .pred %p<4>; - .reg .b16 %h<5>; - .reg .b32 %r<4>; - .reg .f32 %f<3>; - .reg .b64 %rd<48>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 50175; - @%p1 bra $L__BB56_2; - ld.param.u64 %rd5, [Unknown37_param_23]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown37_param_1]; - ld.param.u64 %rd7, [Unknown37_param_12]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - mul.hi.s64 %rd10, %rd4, 5270498306774157605; - shr.u64 %rd11, %rd10, 63; - shr.s64 %rd12, %rd10, 2; - add.s64 %rd13, %rd12, %rd11; - mul.lo.s64 %rd14, %rd13, 14; - sub.s64 %rd15, %rd4, %rd14; - setp.lt.s64 %p2, %rd15, 0; - add.s64 %rd16, %rd15, 14; - selp.b64 %rd17, %rd16, %rd15, %p2; - shr.s64 %rd18, %rd4, 63; - xor.b64 %rd19, %rd18, %rd4; - mul.hi.s64 %rd20, %rd19, 5270498306774157605; - shr.u64 %rd21, %rd20, 63; - shr.s64 %rd22, %rd20, 2; - add.s64 %rd23, %rd22, %rd21; - xor.b64 %rd24, %rd23, %rd18; - mul.hi.s64 %rd25, %rd24, 5270498306774157605; - shr.u64 %rd26, %rd25, 63; - shr.s64 %rd27, %rd25, 2; - add.s64 %rd28, %rd27, %rd26; - mul.lo.s64 %rd29, %rd28, 14; - sub.s64 %rd30, %rd24, %rd29; - setp.lt.s64 %p3, %rd30, 0; - add.s64 %rd31, %rd30, 14; - selp.b64 %rd32, %rd31, %rd30, %p3; - shr.s64 %rd33, %rd24, 63; - xor.b64 %rd34, %rd33, %rd24; - mul.hi.s64 %rd35, %rd34, 5270498306774157605; - shr.u64 %rd36, %rd35, 63; - shr.s64 %rd37, %rd35, 2; - add.s64 %rd38, %rd37, %rd36; - xor.b64 %rd39, %rd38, %rd33; - mul.lo.s64 %rd40, %rd39, 196; - mul.lo.s64 %rd41, %rd32, 14; - add.s64 %rd42, %rd41, %rd17; - add.s64 %rd43, %rd42, %rd40; - shl.b64 %rd44, %rd43, 1; - add.s64 %rd45, %rd3, %rd44; - ld.global.b16 %h1, [%rd45]; - add.s64 %rd46, %rd2, %rd44; - ld.global.b16 %h2, [%rd46]; - add.rn.f16 %h3, %h1, %h2; - cvt.f32.f16 %f1, %h3; - max.f32 %f2, %f1, 0f00000000; - cvt.rn.f16.f32 %h4, %f2; - add.s64 %rd47, %rd1, %rd44; - st.global.b16 [%rd47], %h4; -$L__BB56_2: + .reg .pred %p<3>; + .reg .b16 %rs<6>; + .reg .b32 %r<5>; + .reg .b64 %rd<22>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd15, %r3; + mul.wide.s32 %rd16, %r2, %r1; + add.s64 %rd21, %rd16, %rd15; + setp.gt.s64 %p1, %rd21, 50175; + @%p1 bra $L__BB12_3; + ld.param.u64 %rd12, [Unknown37_param_23]; + cvta.to.global.u64 %rd1, %rd12; + ld.param.u64 %rd13, [Unknown37_param_1]; + ld.param.u64 %rd14, [Unknown37_param_12]; + cvta.to.global.u64 %rd2, %rd14; + cvta.to.global.u64 %rd3, %rd13; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd5, %r2, %r4; + shl.b64 %rd20, %rd21, 1; + shl.b64 %rd7, %rd5, 1; +$L__BB12_2: + add.s64 %rd17, %rd3, %rd20; + ld.global.nc.u16 %rs1, [%rd17]; + add.s64 %rd18, %rd2, %rd20; + ld.global.nc.u16 %rs2, [%rd18]; + add.rn.f16 %rs3, %rs1, %rs2; + mov.b16 %rs4, 0x0000; + max.NaN.f16 %rs5, %rs3, %rs4; + add.s64 %rd19, %rd1, %rd20; + st.global.b16 
[%rd19], %rs5; + add.s64 %rd21, %rd21, %rd5; + add.s64 %rd20, %rd20, %rd7; + setp.lt.s64 %p2, %rd21, 50176; + @%p2 bra $L__BB12_2; +$L__BB12_3: ret; } @@ -3676,80 +870,42 @@ $L__BB56_2: .param .u64 Unknown35_param_21 ) { - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; + .reg .pred %p<3>; + .reg .b16 %rs<2>; + .reg .b32 %r<5>; .reg .f32 %f<2>; - .reg .b64 %rd<57>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 589823; - @%p1 bra $L__BB57_2; - ld.param.u64 %rd4, [Unknown35_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown35_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 56; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -256; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 256; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 56; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 8; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 2304; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 2; - add.s64 %rd54, %rd2, %rd53; - ld.global.f32 %f1, [%rd54]; - cvt.rn.f16.f32 %h1, %f1; - shl.b64 %rd55, %rd52, 1; - add.s64 %rd56, %rd1, %rd55; - st.global.b16 [%rd56], %h1; -$L__BB57_2: + .reg .b64 %rd<24>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 589823; + @%p1 bra $L__BB13_3; + ld.param.u64 %rd15, [Unknown35_param_12]; + cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, [Unknown35_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd19, %rd23, 2; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 2; + shl.b64 %rd20, %rd23, 1; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 1; +$L__BB13_2: + ld.global.nc.f32 %f1, [%rd22]; + cvt.rn.f16.f32 %rs1, %f1; + st.global.b16 [%rd21], %rs1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, %rd23, 589824; + @%p2 bra $L__BB13_2; +$L__BB13_3: ret; } @@ -3779,69 +935,39 @@ $L__BB57_2: .param .u64 Unknown34_param_21 ) { - .reg .pred %p<4>; - .reg .b16 %h<3>; - .reg .b32 %r<4>; - .reg .f32 %f<3>; - .reg .b64 %rd<45>; - - mov.u32 %r1, %ctaid.x; - 
mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 50175; - @%p1 bra $L__BB58_2; - ld.param.u64 %rd4, [Unknown34_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown34_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 5270498306774157605; - shr.u64 %rd9, %rd8, 63; - shr.s64 %rd10, %rd8, 2; - add.s64 %rd11, %rd10, %rd9; - mul.lo.s64 %rd12, %rd11, 14; - sub.s64 %rd13, %rd3, %rd12; - setp.lt.s64 %p2, %rd13, 0; - add.s64 %rd14, %rd13, 14; - selp.b64 %rd15, %rd14, %rd13, %p2; - shr.s64 %rd16, %rd3, 63; - xor.b64 %rd17, %rd16, %rd3; - mul.hi.s64 %rd18, %rd17, 5270498306774157605; - shr.u64 %rd19, %rd18, 63; - shr.s64 %rd20, %rd18, 2; - add.s64 %rd21, %rd20, %rd19; - xor.b64 %rd22, %rd21, %rd16; - mul.hi.s64 %rd23, %rd22, 5270498306774157605; - shr.u64 %rd24, %rd23, 63; - shr.s64 %rd25, %rd23, 2; - add.s64 %rd26, %rd25, %rd24; - mul.lo.s64 %rd27, %rd26, 14; - sub.s64 %rd28, %rd22, %rd27; - setp.lt.s64 %p3, %rd28, 0; - add.s64 %rd29, %rd28, 14; - selp.b64 %rd30, %rd29, %rd28, %p3; - shr.s64 %rd31, %rd22, 63; - xor.b64 %rd32, %rd31, %rd22; - mul.hi.s64 %rd33, %rd32, 5270498306774157605; - shr.u64 %rd34, %rd33, 63; - shr.s64 %rd35, %rd33, 2; - add.s64 %rd36, %rd35, %rd34; - xor.b64 %rd37, %rd36, %rd31; - mul.lo.s64 %rd38, %rd37, 196; - mul.lo.s64 %rd39, %rd30, 14; - add.s64 %rd40, %rd39, %rd15; - add.s64 %rd41, %rd40, %rd38; - shl.b64 %rd42, %rd41, 1; - add.s64 %rd43, %rd2, %rd42; - ld.global.b16 %h1, [%rd43]; - cvt.f32.f16 %f1, %h1; - max.f32 %f2, %f1, 0f00000000; - cvt.rn.f16.f32 %h2, %f2; - add.s64 %rd44, %rd1, %rd42; - st.global.b16 [%rd44], %h2; -$L__BB58_2: + .reg .pred %p<3>; + .reg .b16 %rs<4>; + .reg .b32 %r<5>; + .reg .b64 %rd<19>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd13, %r3; + mul.wide.s32 %rd14, %r2, %r1; + add.s64 %rd18, %rd14, %rd13; + setp.gt.s64 %p1, %rd18, 50175; + @%p1 bra $L__BB14_3; + ld.param.u64 %rd11, [Unknown34_param_12]; + cvta.to.global.u64 %rd1, %rd11; + ld.param.u64 %rd12, [Unknown34_param_1]; + cvta.to.global.u64 %rd2, %rd12; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd17, %rd18, 1; + shl.b64 %rd6, %rd4, 1; +$L__BB14_2: + add.s64 %rd15, %rd2, %rd17; + ld.global.nc.u16 %rs1, [%rd15]; + mov.b16 %rs2, 0x0000; + max.NaN.f16 %rs3, %rs1, %rs2; + add.s64 %rd16, %rd1, %rd17; + st.global.b16 [%rd16], %rs3; + add.s64 %rd18, %rd18, %rd4; + add.s64 %rd17, %rd17, %rd6; + setp.lt.s64 %p2, %rd18, 50176; + @%p2 bra $L__BB14_2; +$L__BB14_3: ret; } @@ -3871,80 +997,42 @@ $L__BB58_2: .param .u64 Unknown32_param_21 ) { - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; + .reg .pred %p<3>; + .reg .b16 %rs<2>; + .reg .b32 %r<5>; .reg .f32 %f<2>; - .reg .b64 %rd<57>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 294911; - @%p1 bra $L__BB59_2; - ld.param.u64 %rd4, [Unknown32_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown32_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 
6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 57; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -128; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 128; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 57; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 7; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 1152; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 2; - add.s64 %rd54, %rd2, %rd53; - ld.global.f32 %f1, [%rd54]; - cvt.rn.f16.f32 %h1, %f1; - shl.b64 %rd55, %rd52, 1; - add.s64 %rd56, %rd1, %rd55; - st.global.b16 [%rd56], %h1; -$L__BB59_2: + .reg .b64 %rd<24>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 294911; + @%p1 bra $L__BB15_3; + ld.param.u64 %rd15, [Unknown32_param_12]; + cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, [Unknown32_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd19, %rd23, 2; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 2; + shl.b64 %rd20, %rd23, 1; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 1; +$L__BB15_2: + ld.global.nc.f32 %f1, [%rd22]; + cvt.rn.f16.f32 %rs1, %f1; + st.global.b16 [%rd21], %rs1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, %rd23, 294912; + @%p2 bra $L__BB15_2; +$L__BB15_3: ret; } @@ -3975,453 +1063,41 @@ $L__BB59_2: ) { .reg .pred %p<3>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; - .reg .f32 %f<2>; - .reg .b64 %rd<27>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 32767; - @%p1 bra $L__BB60_2; - ld.param.u64 %rd4, [Unknown30_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown30_param_1]; - cvta.to.global.u64 %rd2, %rd5; - shr.s64 %rd8, %rd3, 63; - shr.u64 %rd9, %rd8, 57; - add.s64 %rd10, %rd3, %rd9; - and.b64 %rd11, %rd10, -128; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 128; - selp.b64 %rd14, %rd13, %rd12, %p2; - xor.b64 %rd15, %rd8, %rd3; - shr.s64 %rd16, %rd15, 63; - shr.u64 %rd17, %rd16, 57; - add.s64 %rd18, %rd15, %rd17; - shr.u64 %rd19, %rd18, 7; - xor.b64 %rd20, %rd19, %rd8; - shl.b64 %rd21, %rd20, 7; - add.s64 %rd22, %rd21, %rd14; - shl.b64 %rd23, %rd22, 2; - add.s64 %rd24, %rd2, %rd23; - ld.global.f32 %f1, [%rd24]; - cvt.rn.f16.f32 %h1, %f1; - shl.b64 %rd25, %rd22, 1; - add.s64 %rd26, %rd1, %rd25; - st.global.b16 [%rd26], %h1; -$L__BB60_2: - ret; - -} - // .globl Unknown29 -.visible .entry Unknown29( - .param .u64 Unknown29_param_0, 
- .param .u64 Unknown29_param_1, - .param .u64 Unknown29_param_2, - .param .u64 Unknown29_param_3, - .param .u64 Unknown29_param_4, - .param .u64 Unknown29_param_5, - .param .u64 Unknown29_param_6, - .param .u64 Unknown29_param_7, - .param .u64 Unknown29_param_8, - .param .u64 Unknown29_param_9, - .param .u64 Unknown29_param_10, - .param .u64 Unknown29_param_11, - .param .u64 Unknown29_param_12, - .param .u64 Unknown29_param_13, - .param .u64 Unknown29_param_14, - .param .u64 Unknown29_param_15, - .param .u64 Unknown29_param_16, - .param .u64 Unknown29_param_17, - .param .u64 Unknown29_param_18, - .param .u64 Unknown29_param_19, - .param .u64 Unknown29_param_20, - .param .u64 Unknown29_param_21, - .param .u64 Unknown29_param_22, - .param .u64 Unknown29_param_23, - .param .u64 Unknown29_param_24, - .param .u64 Unknown29_param_25, - .param .u64 Unknown29_param_26, - .param .u64 Unknown29_param_27, - .param .u64 Unknown29_param_28, - .param .u64 Unknown29_param_29, - .param .u64 Unknown29_param_30, - .param .u64 Unknown29_param_31, - .param .u64 Unknown29_param_32 -) -{ - .reg .pred %p<4>; - .reg .b16 %h<5>; - .reg .b32 %r<4>; - .reg .f32 %f<3>; - .reg .b64 %rd<48>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 100351; - @%p1 bra $L__BB61_2; - ld.param.u64 %rd5, [Unknown29_param_23]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown29_param_1]; - ld.param.u64 %rd7, [Unknown29_param_12]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - mul.hi.s64 %rd10, %rd4, 5270498306774157605; - shr.u64 %rd11, %rd10, 63; - shr.s64 %rd12, %rd10, 3; - add.s64 %rd13, %rd12, %rd11; - mul.lo.s64 %rd14, %rd13, 28; - sub.s64 %rd15, %rd4, %rd14; - setp.lt.s64 %p2, %rd15, 0; - add.s64 %rd16, %rd15, 28; - selp.b64 %rd17, %rd16, %rd15, %p2; - shr.s64 %rd18, %rd4, 63; - xor.b64 %rd19, %rd18, %rd4; - mul.hi.s64 %rd20, %rd19, 5270498306774157605; - shr.u64 %rd21, %rd20, 63; - shr.s64 %rd22, %rd20, 3; - add.s64 %rd23, %rd22, %rd21; - xor.b64 %rd24, %rd23, %rd18; - mul.hi.s64 %rd25, %rd24, 5270498306774157605; - shr.u64 %rd26, %rd25, 63; - shr.s64 %rd27, %rd25, 3; - add.s64 %rd28, %rd27, %rd26; - mul.lo.s64 %rd29, %rd28, 28; - sub.s64 %rd30, %rd24, %rd29; - setp.lt.s64 %p3, %rd30, 0; - add.s64 %rd31, %rd30, 28; - selp.b64 %rd32, %rd31, %rd30, %p3; - shr.s64 %rd33, %rd24, 63; - xor.b64 %rd34, %rd33, %rd24; - mul.hi.s64 %rd35, %rd34, 5270498306774157605; - shr.u64 %rd36, %rd35, 63; - shr.s64 %rd37, %rd35, 3; - add.s64 %rd38, %rd37, %rd36; - xor.b64 %rd39, %rd38, %rd33; - mul.lo.s64 %rd40, %rd39, 784; - mul.lo.s64 %rd41, %rd32, 28; - add.s64 %rd42, %rd41, %rd17; - add.s64 %rd43, %rd42, %rd40; - shl.b64 %rd44, %rd43, 1; - add.s64 %rd45, %rd3, %rd44; - ld.global.b16 %h1, [%rd45]; - add.s64 %rd46, %rd2, %rd44; - ld.global.b16 %h2, [%rd46]; - add.rn.f16 %h3, %h1, %h2; - cvt.f32.f16 %f1, %h3; - max.f32 %f2, %f1, 0f00000000; - cvt.rn.f16.f32 %h4, %f2; - add.s64 %rd47, %rd1, %rd44; - st.global.b16 [%rd47], %h4; -$L__BB61_2: - ret; - -} - // .globl Unknown27 -.visible .entry Unknown27( - .param .u64 Unknown27_param_0, - .param .u64 Unknown27_param_1, - .param .u64 Unknown27_param_2, - .param .u64 Unknown27_param_3, - .param .u64 Unknown27_param_4, - .param .u64 Unknown27_param_5, - .param .u64 Unknown27_param_6, - .param .u64 Unknown27_param_7, - .param .u64 Unknown27_param_8, - .param .u64 Unknown27_param_9, - .param .u64 Unknown27_param_10, - .param .u64 
Unknown27_param_11, - .param .u64 Unknown27_param_12, - .param .u64 Unknown27_param_13, - .param .u64 Unknown27_param_14, - .param .u64 Unknown27_param_15, - .param .u64 Unknown27_param_16, - .param .u64 Unknown27_param_17, - .param .u64 Unknown27_param_18, - .param .u64 Unknown27_param_19, - .param .u64 Unknown27_param_20, - .param .u64 Unknown27_param_21 -) -{ - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; - .reg .f32 %f<2>; - .reg .b64 %rd<57>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 147455; - @%p1 bra $L__BB62_2; - ld.param.u64 %rd4, [Unknown27_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown27_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 57; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -128; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 128; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 57; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 7; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 1152; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 2; - add.s64 %rd54, %rd2, %rd53; - ld.global.f32 %f1, [%rd54]; - cvt.rn.f16.f32 %h1, %f1; - shl.b64 %rd55, %rd52, 1; - add.s64 %rd56, %rd1, %rd55; - st.global.b16 [%rd56], %h1; -$L__BB62_2: - ret; - -} - // .globl Unknown26 -.visible .entry Unknown26( - .param .u64 Unknown26_param_0, - .param .u64 Unknown26_param_1, - .param .u64 Unknown26_param_2, - .param .u64 Unknown26_param_3, - .param .u64 Unknown26_param_4, - .param .u64 Unknown26_param_5, - .param .u64 Unknown26_param_6, - .param .u64 Unknown26_param_7, - .param .u64 Unknown26_param_8, - .param .u64 Unknown26_param_9, - .param .u64 Unknown26_param_10, - .param .u64 Unknown26_param_11, - .param .u64 Unknown26_param_12, - .param .u64 Unknown26_param_13, - .param .u64 Unknown26_param_14, - .param .u64 Unknown26_param_15, - .param .u64 Unknown26_param_16, - .param .u64 Unknown26_param_17, - .param .u64 Unknown26_param_18, - .param .u64 Unknown26_param_19, - .param .u64 Unknown26_param_20, - .param .u64 Unknown26_param_21 -) -{ - .reg .pred %p<4>; - .reg .b16 %h<3>; - .reg .b32 %r<4>; - .reg .f32 %f<3>; - .reg .b64 %rd<45>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, 
%r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 100351; - @%p1 bra $L__BB63_2; - ld.param.u64 %rd4, [Unknown26_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown26_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 5270498306774157605; - shr.u64 %rd9, %rd8, 63; - shr.s64 %rd10, %rd8, 3; - add.s64 %rd11, %rd10, %rd9; - mul.lo.s64 %rd12, %rd11, 28; - sub.s64 %rd13, %rd3, %rd12; - setp.lt.s64 %p2, %rd13, 0; - add.s64 %rd14, %rd13, 28; - selp.b64 %rd15, %rd14, %rd13, %p2; - shr.s64 %rd16, %rd3, 63; - xor.b64 %rd17, %rd16, %rd3; - mul.hi.s64 %rd18, %rd17, 5270498306774157605; - shr.u64 %rd19, %rd18, 63; - shr.s64 %rd20, %rd18, 3; - add.s64 %rd21, %rd20, %rd19; - xor.b64 %rd22, %rd21, %rd16; - mul.hi.s64 %rd23, %rd22, 5270498306774157605; - shr.u64 %rd24, %rd23, 63; - shr.s64 %rd25, %rd23, 3; - add.s64 %rd26, %rd25, %rd24; - mul.lo.s64 %rd27, %rd26, 28; - sub.s64 %rd28, %rd22, %rd27; - setp.lt.s64 %p3, %rd28, 0; - add.s64 %rd29, %rd28, 28; - selp.b64 %rd30, %rd29, %rd28, %p3; - shr.s64 %rd31, %rd22, 63; - xor.b64 %rd32, %rd31, %rd22; - mul.hi.s64 %rd33, %rd32, 5270498306774157605; - shr.u64 %rd34, %rd33, 63; - shr.s64 %rd35, %rd33, 3; - add.s64 %rd36, %rd35, %rd34; - xor.b64 %rd37, %rd36, %rd31; - mul.lo.s64 %rd38, %rd37, 784; - mul.lo.s64 %rd39, %rd30, 28; - add.s64 %rd40, %rd39, %rd15; - add.s64 %rd41, %rd40, %rd38; - shl.b64 %rd42, %rd41, 1; - add.s64 %rd43, %rd2, %rd42; - ld.global.b16 %h1, [%rd43]; - cvt.f32.f16 %f1, %h1; - max.f32 %f2, %f1, 0f00000000; - cvt.rn.f16.f32 %h2, %f2; - add.s64 %rd44, %rd1, %rd42; - st.global.b16 [%rd44], %h2; -$L__BB63_2: - ret; - -} - // .globl Unknown24 -.visible .entry Unknown24( - .param .u64 Unknown24_param_0, - .param .u64 Unknown24_param_1, - .param .u64 Unknown24_param_2, - .param .u64 Unknown24_param_3, - .param .u64 Unknown24_param_4, - .param .u64 Unknown24_param_5, - .param .u64 Unknown24_param_6, - .param .u64 Unknown24_param_7, - .param .u64 Unknown24_param_8, - .param .u64 Unknown24_param_9, - .param .u64 Unknown24_param_10, - .param .u64 Unknown24_param_11, - .param .u64 Unknown24_param_12, - .param .u64 Unknown24_param_13, - .param .u64 Unknown24_param_14, - .param .u64 Unknown24_param_15, - .param .u64 Unknown24_param_16, - .param .u64 Unknown24_param_17, - .param .u64 Unknown24_param_18, - .param .u64 Unknown24_param_19, - .param .u64 Unknown24_param_20, - .param .u64 Unknown24_param_21 -) -{ - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; + .reg .b16 %rs<2>; + .reg .b32 %r<5>; .reg .f32 %f<2>; - .reg .b64 %rd<57>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 147455; - @%p1 bra $L__BB64_2; - ld.param.u64 %rd4, [Unknown24_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown24_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - 
sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 57; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -128; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 128; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 57; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 7; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 1152; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 2; - add.s64 %rd54, %rd2, %rd53; - ld.global.f32 %f1, [%rd54]; - cvt.rn.f16.f32 %h1, %f1; - shl.b64 %rd55, %rd52, 1; - add.s64 %rd56, %rd1, %rd55; - st.global.b16 [%rd56], %h1; -$L__BB64_2: + .reg .b64 %rd<24>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 32767; + @%p1 bra $L__BB16_3; + ld.param.u64 %rd15, [Unknown30_param_12]; + cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, [Unknown30_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd19, %rd23, 2; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 2; + shl.b64 %rd20, %rd23, 1; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 1; +$L__BB16_2: + ld.global.nc.f32 %f1, [%rd22]; + cvt.rn.f16.f32 %rs1, %f1; + st.global.b16 [%rd21], %rs1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, %rd23, 32768; + @%p2 bra $L__BB16_2; +$L__BB16_3: ret; } @@ -4462,74 +1138,44 @@ $L__BB64_2: .param .u64 Unknown23_param_32 ) { - .reg .pred %p<4>; - .reg .b16 %h<5>; - .reg .b32 %r<4>; - .reg .f32 %f<3>; - .reg .b64 %rd<48>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 100351; - @%p1 bra $L__BB65_2; - ld.param.u64 %rd5, [Unknown23_param_23]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown23_param_1]; - ld.param.u64 %rd7, [Unknown23_param_12]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - mul.hi.s64 %rd10, %rd4, 5270498306774157605; - shr.u64 %rd11, %rd10, 63; - shr.s64 %rd12, %rd10, 3; - add.s64 %rd13, %rd12, %rd11; - mul.lo.s64 %rd14, %rd13, 28; - sub.s64 %rd15, %rd4, %rd14; - setp.lt.s64 %p2, %rd15, 0; - add.s64 %rd16, %rd15, 28; - selp.b64 %rd17, %rd16, %rd15, %p2; - shr.s64 %rd18, %rd4, 63; - xor.b64 %rd19, %rd18, %rd4; - mul.hi.s64 %rd20, %rd19, 5270498306774157605; - shr.u64 %rd21, %rd20, 63; - shr.s64 %rd22, %rd20, 3; - add.s64 %rd23, %rd22, %rd21; - xor.b64 %rd24, %rd23, %rd18; - mul.hi.s64 %rd25, %rd24, 5270498306774157605; - shr.u64 %rd26, %rd25, 63; - shr.s64 %rd27, %rd25, 3; - add.s64 %rd28, %rd27, %rd26; - mul.lo.s64 %rd29, %rd28, 28; - sub.s64 %rd30, %rd24, %rd29; - setp.lt.s64 %p3, %rd30, 0; - add.s64 %rd31, %rd30, 28; - selp.b64 %rd32, %rd31, %rd30, %p3; - shr.s64 %rd33, %rd24, 63; - xor.b64 %rd34, %rd33, %rd24; - mul.hi.s64 %rd35, %rd34, 5270498306774157605; - shr.u64 
%rd36, %rd35, 63; - shr.s64 %rd37, %rd35, 3; - add.s64 %rd38, %rd37, %rd36; - xor.b64 %rd39, %rd38, %rd33; - mul.lo.s64 %rd40, %rd39, 784; - mul.lo.s64 %rd41, %rd32, 28; - add.s64 %rd42, %rd41, %rd17; - add.s64 %rd43, %rd42, %rd40; - shl.b64 %rd44, %rd43, 1; - add.s64 %rd45, %rd3, %rd44; - ld.global.b16 %h1, [%rd45]; - add.s64 %rd46, %rd2, %rd44; - ld.global.b16 %h2, [%rd46]; - add.rn.f16 %h3, %h1, %h2; - cvt.f32.f16 %f1, %h3; - max.f32 %f2, %f1, 0f00000000; - cvt.rn.f16.f32 %h4, %f2; - add.s64 %rd47, %rd1, %rd44; - st.global.b16 [%rd47], %h4; -$L__BB65_2: + .reg .pred %p<3>; + .reg .b16 %rs<6>; + .reg .b32 %r<5>; + .reg .b64 %rd<22>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd15, %r3; + mul.wide.s32 %rd16, %r2, %r1; + add.s64 %rd21, %rd16, %rd15; + setp.gt.s64 %p1, %rd21, 100351; + @%p1 bra $L__BB17_3; + ld.param.u64 %rd12, [Unknown23_param_23]; + cvta.to.global.u64 %rd1, %rd12; + ld.param.u64 %rd13, [Unknown23_param_1]; + ld.param.u64 %rd14, [Unknown23_param_12]; + cvta.to.global.u64 %rd2, %rd14; + cvta.to.global.u64 %rd3, %rd13; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd5, %r2, %r4; + shl.b64 %rd20, %rd21, 1; + shl.b64 %rd7, %rd5, 1; +$L__BB17_2: + add.s64 %rd17, %rd3, %rd20; + ld.global.nc.u16 %rs1, [%rd17]; + add.s64 %rd18, %rd2, %rd20; + ld.global.nc.u16 %rs2, [%rd18]; + add.rn.f16 %rs3, %rs1, %rs2; + mov.b16 %rs4, 0x0000; + max.NaN.f16 %rs5, %rs3, %rs4; + add.s64 %rd19, %rd1, %rd20; + st.global.b16 [%rd19], %rs5; + add.s64 %rd21, %rd21, %rd5; + add.s64 %rd20, %rd20, %rd7; + setp.lt.s64 %p2, %rd21, 100352; + @%p2 bra $L__BB17_2; +$L__BB17_3: ret; } @@ -4559,80 +1205,42 @@ $L__BB65_2: .param .u64 Unknown21_param_21 ) { - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; + .reg .pred %p<3>; + .reg .b16 %rs<2>; + .reg .b32 %r<5>; .reg .f32 %f<2>; - .reg .b64 %rd<57>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 147455; - @%p1 bra $L__BB66_2; - ld.param.u64 %rd4, [Unknown21_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown21_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 57; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -128; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 128; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 57; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 7; - xor.b64 %rd46, %rd45, %rd34; - 
mul.lo.s64 %rd47, %rd46, 1152; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 2; - add.s64 %rd54, %rd2, %rd53; - ld.global.f32 %f1, [%rd54]; - cvt.rn.f16.f32 %h1, %f1; - shl.b64 %rd55, %rd52, 1; - add.s64 %rd56, %rd1, %rd55; - st.global.b16 [%rd56], %h1; -$L__BB66_2: + .reg .b64 %rd<24>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 147455; + @%p1 bra $L__BB18_3; + ld.param.u64 %rd15, [Unknown21_param_12]; + cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, [Unknown21_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd19, %rd23, 2; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 2; + shl.b64 %rd20, %rd23, 1; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 1; +$L__BB18_2: + ld.global.nc.f32 %f1, [%rd22]; + cvt.rn.f16.f32 %rs1, %f1; + st.global.b16 [%rd21], %rs1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, %rd23, 147456; + @%p2 bra $L__BB18_2; +$L__BB18_3: ret; } @@ -4662,69 +1270,39 @@ $L__BB66_2: .param .u64 Unknown20_param_21 ) { - .reg .pred %p<4>; - .reg .b16 %h<3>; - .reg .b32 %r<4>; - .reg .f32 %f<3>; - .reg .b64 %rd<45>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 100351; - @%p1 bra $L__BB67_2; - ld.param.u64 %rd4, [Unknown20_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown20_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 5270498306774157605; - shr.u64 %rd9, %rd8, 63; - shr.s64 %rd10, %rd8, 3; - add.s64 %rd11, %rd10, %rd9; - mul.lo.s64 %rd12, %rd11, 28; - sub.s64 %rd13, %rd3, %rd12; - setp.lt.s64 %p2, %rd13, 0; - add.s64 %rd14, %rd13, 28; - selp.b64 %rd15, %rd14, %rd13, %p2; - shr.s64 %rd16, %rd3, 63; - xor.b64 %rd17, %rd16, %rd3; - mul.hi.s64 %rd18, %rd17, 5270498306774157605; - shr.u64 %rd19, %rd18, 63; - shr.s64 %rd20, %rd18, 3; - add.s64 %rd21, %rd20, %rd19; - xor.b64 %rd22, %rd21, %rd16; - mul.hi.s64 %rd23, %rd22, 5270498306774157605; - shr.u64 %rd24, %rd23, 63; - shr.s64 %rd25, %rd23, 3; - add.s64 %rd26, %rd25, %rd24; - mul.lo.s64 %rd27, %rd26, 28; - sub.s64 %rd28, %rd22, %rd27; - setp.lt.s64 %p3, %rd28, 0; - add.s64 %rd29, %rd28, 28; - selp.b64 %rd30, %rd29, %rd28, %p3; - shr.s64 %rd31, %rd22, 63; - xor.b64 %rd32, %rd31, %rd22; - mul.hi.s64 %rd33, %rd32, 5270498306774157605; - shr.u64 %rd34, %rd33, 63; - shr.s64 %rd35, %rd33, 3; - add.s64 %rd36, %rd35, %rd34; - xor.b64 %rd37, %rd36, %rd31; - mul.lo.s64 %rd38, %rd37, 784; - mul.lo.s64 %rd39, %rd30, 28; - add.s64 %rd40, %rd39, %rd15; - add.s64 %rd41, %rd40, %rd38; - shl.b64 %rd42, %rd41, 1; - add.s64 %rd43, %rd2, %rd42; - ld.global.b16 %h1, [%rd43]; - cvt.f32.f16 %f1, %h1; - max.f32 %f2, %f1, 0f00000000; - cvt.rn.f16.f32 %h2, %f2; - add.s64 %rd44, %rd1, %rd42; - st.global.b16 [%rd44], %h2; -$L__BB67_2: + .reg .pred %p<3>; + .reg .b16 %rs<4>; + .reg .b32 %r<5>; + .reg .b64 %rd<19>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd13, %r3; + mul.wide.s32 %rd14, %r2, %r1; + add.s64 %rd18, %rd14, %rd13; + setp.gt.s64 %p1, %rd18, 100351; + @%p1 bra $L__BB19_3; + ld.param.u64 %rd11, [Unknown20_param_12]; + 
cvta.to.global.u64 %rd1, %rd11; + ld.param.u64 %rd12, [Unknown20_param_1]; + cvta.to.global.u64 %rd2, %rd12; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd17, %rd18, 1; + shl.b64 %rd6, %rd4, 1; +$L__BB19_2: + add.s64 %rd15, %rd2, %rd17; + ld.global.nc.u16 %rs1, [%rd15]; + mov.b16 %rs2, 0x0000; + max.NaN.f16 %rs3, %rs1, %rs2; + add.s64 %rd16, %rd1, %rd17; + st.global.b16 [%rd16], %rs3; + add.s64 %rd18, %rd18, %rd4; + add.s64 %rd17, %rd17, %rd6; + setp.lt.s64 %p2, %rd18, 100352; + @%p2 bra $L__BB19_2; +$L__BB19_3: ret; } @@ -4754,80 +1332,42 @@ $L__BB67_2: .param .u64 Unknown18_param_21 ) { - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; + .reg .pred %p<3>; + .reg .b16 %rs<2>; + .reg .b32 %r<5>; .reg .f32 %f<2>; - .reg .b64 %rd<57>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 73727; - @%p1 bra $L__BB68_2; - ld.param.u64 %rd4, [Unknown18_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown18_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 58; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -64; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 64; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 58; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 6; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 576; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 2; - add.s64 %rd54, %rd2, %rd53; - ld.global.f32 %f1, [%rd54]; - cvt.rn.f16.f32 %h1, %f1; - shl.b64 %rd55, %rd52, 1; - add.s64 %rd56, %rd1, %rd55; - st.global.b16 [%rd56], %h1; -$L__BB68_2: + .reg .b64 %rd<24>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 73727; + @%p1 bra $L__BB20_3; + ld.param.u64 %rd15, [Unknown18_param_12]; + cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, [Unknown18_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd19, %rd23, 2; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 2; + shl.b64 %rd20, %rd23, 1; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 1; +$L__BB20_2: + ld.global.nc.f32 %f1, [%rd22]; + cvt.rn.f16.f32 %rs1, %f1; + 
st.global.b16 [%rd21], %rs1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, %rd23, 73728; + @%p2 bra $L__BB20_2; +$L__BB20_3: ret; } @@ -4858,453 +1398,41 @@ $L__BB68_2: ) { .reg .pred %p<3>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; - .reg .f32 %f<2>; - .reg .b64 %rd<27>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 8191; - @%p1 bra $L__BB69_2; - ld.param.u64 %rd4, [Unknown16_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown16_param_1]; - cvta.to.global.u64 %rd2, %rd5; - shr.s64 %rd8, %rd3, 63; - shr.u64 %rd9, %rd8, 58; - add.s64 %rd10, %rd3, %rd9; - and.b64 %rd11, %rd10, -64; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 64; - selp.b64 %rd14, %rd13, %rd12, %p2; - xor.b64 %rd15, %rd8, %rd3; - shr.s64 %rd16, %rd15, 63; - shr.u64 %rd17, %rd16, 58; - add.s64 %rd18, %rd15, %rd17; - shr.u64 %rd19, %rd18, 6; - xor.b64 %rd20, %rd19, %rd8; - shl.b64 %rd21, %rd20, 6; - add.s64 %rd22, %rd21, %rd14; - shl.b64 %rd23, %rd22, 2; - add.s64 %rd24, %rd2, %rd23; - ld.global.f32 %f1, [%rd24]; - cvt.rn.f16.f32 %h1, %f1; - shl.b64 %rd25, %rd22, 1; - add.s64 %rd26, %rd1, %rd25; - st.global.b16 [%rd26], %h1; -$L__BB69_2: - ret; - -} - // .globl Unknown15 -.visible .entry Unknown15( - .param .u64 Unknown15_param_0, - .param .u64 Unknown15_param_1, - .param .u64 Unknown15_param_2, - .param .u64 Unknown15_param_3, - .param .u64 Unknown15_param_4, - .param .u64 Unknown15_param_5, - .param .u64 Unknown15_param_6, - .param .u64 Unknown15_param_7, - .param .u64 Unknown15_param_8, - .param .u64 Unknown15_param_9, - .param .u64 Unknown15_param_10, - .param .u64 Unknown15_param_11, - .param .u64 Unknown15_param_12, - .param .u64 Unknown15_param_13, - .param .u64 Unknown15_param_14, - .param .u64 Unknown15_param_15, - .param .u64 Unknown15_param_16, - .param .u64 Unknown15_param_17, - .param .u64 Unknown15_param_18, - .param .u64 Unknown15_param_19, - .param .u64 Unknown15_param_20, - .param .u64 Unknown15_param_21, - .param .u64 Unknown15_param_22, - .param .u64 Unknown15_param_23, - .param .u64 Unknown15_param_24, - .param .u64 Unknown15_param_25, - .param .u64 Unknown15_param_26, - .param .u64 Unknown15_param_27, - .param .u64 Unknown15_param_28, - .param .u64 Unknown15_param_29, - .param .u64 Unknown15_param_30, - .param .u64 Unknown15_param_31, - .param .u64 Unknown15_param_32 -) -{ - .reg .pred %p<4>; - .reg .b16 %h<5>; - .reg .b32 %r<4>; - .reg .f32 %f<3>; - .reg .b64 %rd<48>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 200703; - @%p1 bra $L__BB70_2; - ld.param.u64 %rd5, [Unknown15_param_23]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown15_param_1]; - ld.param.u64 %rd7, [Unknown15_param_12]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - mul.hi.s64 %rd10, %rd4, 5270498306774157605; - shr.u64 %rd11, %rd10, 63; - shr.s64 %rd12, %rd10, 4; - add.s64 %rd13, %rd12, %rd11; - mul.lo.s64 %rd14, %rd13, 56; - sub.s64 %rd15, %rd4, %rd14; - setp.lt.s64 %p2, %rd15, 0; - add.s64 %rd16, %rd15, 56; - selp.b64 %rd17, %rd16, %rd15, %p2; - shr.s64 %rd18, %rd4, 63; - xor.b64 %rd19, %rd18, %rd4; - mul.hi.s64 %rd20, %rd19, 5270498306774157605; - shr.u64 %rd21, %rd20, 63; - shr.s64 %rd22, %rd20, 4; - add.s64 
%rd23, %rd22, %rd21; - xor.b64 %rd24, %rd23, %rd18; - mul.hi.s64 %rd25, %rd24, 5270498306774157605; - shr.u64 %rd26, %rd25, 63; - shr.s64 %rd27, %rd25, 4; - add.s64 %rd28, %rd27, %rd26; - mul.lo.s64 %rd29, %rd28, 56; - sub.s64 %rd30, %rd24, %rd29; - setp.lt.s64 %p3, %rd30, 0; - add.s64 %rd31, %rd30, 56; - selp.b64 %rd32, %rd31, %rd30, %p3; - shr.s64 %rd33, %rd24, 63; - xor.b64 %rd34, %rd33, %rd24; - mul.hi.s64 %rd35, %rd34, 5270498306774157605; - shr.u64 %rd36, %rd35, 63; - shr.s64 %rd37, %rd35, 4; - add.s64 %rd38, %rd37, %rd36; - xor.b64 %rd39, %rd38, %rd33; - mul.lo.s64 %rd40, %rd39, 3136; - mul.lo.s64 %rd41, %rd32, 56; - add.s64 %rd42, %rd41, %rd17; - add.s64 %rd43, %rd42, %rd40; - shl.b64 %rd44, %rd43, 1; - add.s64 %rd45, %rd3, %rd44; - ld.global.b16 %h1, [%rd45]; - add.s64 %rd46, %rd2, %rd44; - ld.global.b16 %h2, [%rd46]; - add.rn.f16 %h3, %h1, %h2; - cvt.f32.f16 %f1, %h3; - max.f32 %f2, %f1, 0f00000000; - cvt.rn.f16.f32 %h4, %f2; - add.s64 %rd47, %rd1, %rd44; - st.global.b16 [%rd47], %h4; -$L__BB70_2: - ret; - -} - // .globl Unknown13 -.visible .entry Unknown13( - .param .u64 Unknown13_param_0, - .param .u64 Unknown13_param_1, - .param .u64 Unknown13_param_2, - .param .u64 Unknown13_param_3, - .param .u64 Unknown13_param_4, - .param .u64 Unknown13_param_5, - .param .u64 Unknown13_param_6, - .param .u64 Unknown13_param_7, - .param .u64 Unknown13_param_8, - .param .u64 Unknown13_param_9, - .param .u64 Unknown13_param_10, - .param .u64 Unknown13_param_11, - .param .u64 Unknown13_param_12, - .param .u64 Unknown13_param_13, - .param .u64 Unknown13_param_14, - .param .u64 Unknown13_param_15, - .param .u64 Unknown13_param_16, - .param .u64 Unknown13_param_17, - .param .u64 Unknown13_param_18, - .param .u64 Unknown13_param_19, - .param .u64 Unknown13_param_20, - .param .u64 Unknown13_param_21 -) -{ - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; - .reg .f32 %f<2>; - .reg .b64 %rd<57>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 36863; - @%p1 bra $L__BB71_2; - ld.param.u64 %rd4, [Unknown13_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown13_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 58; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -64; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 64; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 58; - add.s64 %rd44, %rd41, 
%rd43; - shr.s64 %rd45, %rd44, 6; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 576; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 2; - add.s64 %rd54, %rd2, %rd53; - ld.global.f32 %f1, [%rd54]; - cvt.rn.f16.f32 %h1, %f1; - shl.b64 %rd55, %rd52, 1; - add.s64 %rd56, %rd1, %rd55; - st.global.b16 [%rd56], %h1; -$L__BB71_2: - ret; - -} - // .globl Unknown12 -.visible .entry Unknown12( - .param .u64 Unknown12_param_0, - .param .u64 Unknown12_param_1, - .param .u64 Unknown12_param_2, - .param .u64 Unknown12_param_3, - .param .u64 Unknown12_param_4, - .param .u64 Unknown12_param_5, - .param .u64 Unknown12_param_6, - .param .u64 Unknown12_param_7, - .param .u64 Unknown12_param_8, - .param .u64 Unknown12_param_9, - .param .u64 Unknown12_param_10, - .param .u64 Unknown12_param_11, - .param .u64 Unknown12_param_12, - .param .u64 Unknown12_param_13, - .param .u64 Unknown12_param_14, - .param .u64 Unknown12_param_15, - .param .u64 Unknown12_param_16, - .param .u64 Unknown12_param_17, - .param .u64 Unknown12_param_18, - .param .u64 Unknown12_param_19, - .param .u64 Unknown12_param_20, - .param .u64 Unknown12_param_21 -) -{ - .reg .pred %p<4>; - .reg .b16 %h<3>; - .reg .b32 %r<4>; - .reg .f32 %f<3>; - .reg .b64 %rd<45>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 200703; - @%p1 bra $L__BB72_2; - ld.param.u64 %rd4, [Unknown12_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown12_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 5270498306774157605; - shr.u64 %rd9, %rd8, 63; - shr.s64 %rd10, %rd8, 4; - add.s64 %rd11, %rd10, %rd9; - mul.lo.s64 %rd12, %rd11, 56; - sub.s64 %rd13, %rd3, %rd12; - setp.lt.s64 %p2, %rd13, 0; - add.s64 %rd14, %rd13, 56; - selp.b64 %rd15, %rd14, %rd13, %p2; - shr.s64 %rd16, %rd3, 63; - xor.b64 %rd17, %rd16, %rd3; - mul.hi.s64 %rd18, %rd17, 5270498306774157605; - shr.u64 %rd19, %rd18, 63; - shr.s64 %rd20, %rd18, 4; - add.s64 %rd21, %rd20, %rd19; - xor.b64 %rd22, %rd21, %rd16; - mul.hi.s64 %rd23, %rd22, 5270498306774157605; - shr.u64 %rd24, %rd23, 63; - shr.s64 %rd25, %rd23, 4; - add.s64 %rd26, %rd25, %rd24; - mul.lo.s64 %rd27, %rd26, 56; - sub.s64 %rd28, %rd22, %rd27; - setp.lt.s64 %p3, %rd28, 0; - add.s64 %rd29, %rd28, 56; - selp.b64 %rd30, %rd29, %rd28, %p3; - shr.s64 %rd31, %rd22, 63; - xor.b64 %rd32, %rd31, %rd22; - mul.hi.s64 %rd33, %rd32, 5270498306774157605; - shr.u64 %rd34, %rd33, 63; - shr.s64 %rd35, %rd33, 4; - add.s64 %rd36, %rd35, %rd34; - xor.b64 %rd37, %rd36, %rd31; - mul.lo.s64 %rd38, %rd37, 3136; - mul.lo.s64 %rd39, %rd30, 56; - add.s64 %rd40, %rd39, %rd15; - add.s64 %rd41, %rd40, %rd38; - shl.b64 %rd42, %rd41, 1; - add.s64 %rd43, %rd2, %rd42; - ld.global.b16 %h1, [%rd43]; - cvt.f32.f16 %f1, %h1; - max.f32 %f2, %f1, 0f00000000; - cvt.rn.f16.f32 %h2, %f2; - add.s64 %rd44, %rd1, %rd42; - st.global.b16 [%rd44], %h2; -$L__BB72_2: - ret; - -} - // .globl Unknown10 -.visible .entry Unknown10( - .param .u64 Unknown10_param_0, - .param .u64 Unknown10_param_1, - .param .u64 Unknown10_param_2, - .param .u64 Unknown10_param_3, - .param .u64 Unknown10_param_4, - .param .u64 Unknown10_param_5, - .param .u64 Unknown10_param_6, - .param .u64 Unknown10_param_7, - .param .u64 Unknown10_param_8, - .param .u64 Unknown10_param_9, - .param .u64 Unknown10_param_10, - .param 
.u64 Unknown10_param_11, - .param .u64 Unknown10_param_12, - .param .u64 Unknown10_param_13, - .param .u64 Unknown10_param_14, - .param .u64 Unknown10_param_15, - .param .u64 Unknown10_param_16, - .param .u64 Unknown10_param_17, - .param .u64 Unknown10_param_18, - .param .u64 Unknown10_param_19, - .param .u64 Unknown10_param_20, - .param .u64 Unknown10_param_21 -) -{ - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; + .reg .b16 %rs<2>; + .reg .b32 %r<5>; .reg .f32 %f<2>; - .reg .b64 %rd<57>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 36863; - @%p1 bra $L__BB73_2; - ld.param.u64 %rd4, [Unknown10_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown10_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 58; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -64; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 64; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 58; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 6; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 576; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 2; - add.s64 %rd54, %rd2, %rd53; - ld.global.f32 %f1, [%rd54]; - cvt.rn.f16.f32 %h1, %f1; - shl.b64 %rd55, %rd52, 1; - add.s64 %rd56, %rd1, %rd55; - st.global.b16 [%rd56], %h1; -$L__BB73_2: + .reg .b64 %rd<24>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 8191; + @%p1 bra $L__BB21_3; + ld.param.u64 %rd15, [Unknown16_param_12]; + cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, [Unknown16_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd19, %rd23, 2; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 2; + shl.b64 %rd20, %rd23, 1; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 1; +$L__BB21_2: + ld.global.nc.f32 %f1, [%rd22]; + cvt.rn.f16.f32 %rs1, %f1; + st.global.b16 [%rd21], %rs1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, %rd23, 8192; + @%p2 bra $L__BB21_2; +$L__BB21_3: ret; } @@ -5345,177 +1473,44 @@ $L__BB73_2: .param .u64 Unknown9_param_32 ) { - .reg 
.pred %p<4>; - .reg .b16 %h<5>; - .reg .b32 %r<4>; - .reg .f32 %f<3>; - .reg .b64 %rd<48>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd8, %r3; - mul.wide.s32 %rd9, %r2, %r1; - add.s64 %rd4, %rd9, %rd8; - setp.gt.s64 %p1, %rd4, 200703; - @%p1 bra $L__BB74_2; - ld.param.u64 %rd5, [Unknown9_param_23]; - cvta.to.global.u64 %rd1, %rd5; - ld.param.u64 %rd6, [Unknown9_param_1]; - ld.param.u64 %rd7, [Unknown9_param_12]; - cvta.to.global.u64 %rd2, %rd7; - cvta.to.global.u64 %rd3, %rd6; - mul.hi.s64 %rd10, %rd4, 5270498306774157605; - shr.u64 %rd11, %rd10, 63; - shr.s64 %rd12, %rd10, 4; - add.s64 %rd13, %rd12, %rd11; - mul.lo.s64 %rd14, %rd13, 56; - sub.s64 %rd15, %rd4, %rd14; - setp.lt.s64 %p2, %rd15, 0; - add.s64 %rd16, %rd15, 56; - selp.b64 %rd17, %rd16, %rd15, %p2; - shr.s64 %rd18, %rd4, 63; - xor.b64 %rd19, %rd18, %rd4; - mul.hi.s64 %rd20, %rd19, 5270498306774157605; - shr.u64 %rd21, %rd20, 63; - shr.s64 %rd22, %rd20, 4; - add.s64 %rd23, %rd22, %rd21; - xor.b64 %rd24, %rd23, %rd18; - mul.hi.s64 %rd25, %rd24, 5270498306774157605; - shr.u64 %rd26, %rd25, 63; - shr.s64 %rd27, %rd25, 4; - add.s64 %rd28, %rd27, %rd26; - mul.lo.s64 %rd29, %rd28, 56; - sub.s64 %rd30, %rd24, %rd29; - setp.lt.s64 %p3, %rd30, 0; - add.s64 %rd31, %rd30, 56; - selp.b64 %rd32, %rd31, %rd30, %p3; - shr.s64 %rd33, %rd24, 63; - xor.b64 %rd34, %rd33, %rd24; - mul.hi.s64 %rd35, %rd34, 5270498306774157605; - shr.u64 %rd36, %rd35, 63; - shr.s64 %rd37, %rd35, 4; - add.s64 %rd38, %rd37, %rd36; - xor.b64 %rd39, %rd38, %rd33; - mul.lo.s64 %rd40, %rd39, 3136; - mul.lo.s64 %rd41, %rd32, 56; - add.s64 %rd42, %rd41, %rd17; - add.s64 %rd43, %rd42, %rd40; - shl.b64 %rd44, %rd43, 1; - add.s64 %rd45, %rd3, %rd44; - ld.global.b16 %h1, [%rd45]; - add.s64 %rd46, %rd2, %rd44; - ld.global.b16 %h2, [%rd46]; - add.rn.f16 %h3, %h1, %h2; - cvt.f32.f16 %f1, %h3; - max.f32 %f2, %f1, 0f00000000; - cvt.rn.f16.f32 %h4, %f2; - add.s64 %rd47, %rd1, %rd44; - st.global.b16 [%rd47], %h4; -$L__BB74_2: - ret; - -} - // .globl Unknown7 -.visible .entry Unknown7( - .param .u64 Unknown7_param_0, - .param .u64 Unknown7_param_1, - .param .u64 Unknown7_param_2, - .param .u64 Unknown7_param_3, - .param .u64 Unknown7_param_4, - .param .u64 Unknown7_param_5, - .param .u64 Unknown7_param_6, - .param .u64 Unknown7_param_7, - .param .u64 Unknown7_param_8, - .param .u64 Unknown7_param_9, - .param .u64 Unknown7_param_10, - .param .u64 Unknown7_param_11, - .param .u64 Unknown7_param_12, - .param .u64 Unknown7_param_13, - .param .u64 Unknown7_param_14, - .param .u64 Unknown7_param_15, - .param .u64 Unknown7_param_16, - .param .u64 Unknown7_param_17, - .param .u64 Unknown7_param_18, - .param .u64 Unknown7_param_19, - .param .u64 Unknown7_param_20, - .param .u64 Unknown7_param_21 -) -{ - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; - .reg .f32 %f<2>; - .reg .b64 %rd<57>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 36863; - @%p1 bra $L__BB75_2; - ld.param.u64 %rd4, [Unknown7_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown7_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, 
%rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 58; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -64; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 64; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 %rd43, %rd42, 58; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 6; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 576; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 2; - add.s64 %rd54, %rd2, %rd53; - ld.global.f32 %f1, [%rd54]; - cvt.rn.f16.f32 %h1, %f1; - shl.b64 %rd55, %rd52, 1; - add.s64 %rd56, %rd1, %rd55; - st.global.b16 [%rd56], %h1; -$L__BB75_2: + .reg .pred %p<3>; + .reg .b16 %rs<6>; + .reg .b32 %r<5>; + .reg .b64 %rd<22>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd15, %r3; + mul.wide.s32 %rd16, %r2, %r1; + add.s64 %rd21, %rd16, %rd15; + setp.gt.s64 %p1, %rd21, 200703; + @%p1 bra $L__BB22_3; + ld.param.u64 %rd12, [Unknown9_param_23]; + cvta.to.global.u64 %rd1, %rd12; + ld.param.u64 %rd13, [Unknown9_param_1]; + ld.param.u64 %rd14, [Unknown9_param_12]; + cvta.to.global.u64 %rd2, %rd14; + cvta.to.global.u64 %rd3, %rd13; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd5, %r2, %r4; + shl.b64 %rd20, %rd21, 1; + shl.b64 %rd7, %rd5, 1; +$L__BB22_2: + add.s64 %rd17, %rd3, %rd20; + ld.global.nc.u16 %rs1, [%rd17]; + add.s64 %rd18, %rd2, %rd20; + ld.global.nc.u16 %rs2, [%rd18]; + add.rn.f16 %rs3, %rs1, %rs2; + mov.b16 %rs4, 0x0000; + max.NaN.f16 %rs5, %rs3, %rs4; + add.s64 %rd19, %rd1, %rd20; + st.global.b16 [%rd19], %rs5; + add.s64 %rd21, %rd21, %rd5; + add.s64 %rd20, %rd20, %rd7; + setp.lt.s64 %p2, %rd21, 200704; + @%p2 bra $L__BB22_2; +$L__BB22_3: ret; } @@ -5545,69 +1540,39 @@ $L__BB75_2: .param .u64 Unknown6_param_21 ) { - .reg .pred %p<4>; - .reg .b16 %h<3>; - .reg .b32 %r<4>; - .reg .f32 %f<3>; - .reg .b64 %rd<45>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 200703; - @%p1 bra $L__BB76_2; - ld.param.u64 %rd4, [Unknown6_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown6_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 5270498306774157605; - shr.u64 %rd9, %rd8, 63; - shr.s64 %rd10, %rd8, 4; - add.s64 %rd11, %rd10, %rd9; - mul.lo.s64 %rd12, %rd11, 56; - sub.s64 %rd13, %rd3, %rd12; - setp.lt.s64 %p2, %rd13, 0; - add.s64 %rd14, %rd13, 56; - selp.b64 %rd15, %rd14, %rd13, %p2; - shr.s64 %rd16, %rd3, 63; - xor.b64 %rd17, %rd16, %rd3; - mul.hi.s64 %rd18, %rd17, 5270498306774157605; - shr.u64 %rd19, %rd18, 63; - shr.s64 %rd20, %rd18, 4; - add.s64 %rd21, %rd20, %rd19; - xor.b64 %rd22, %rd21, %rd16; - mul.hi.s64 %rd23, %rd22, 
5270498306774157605; - shr.u64 %rd24, %rd23, 63; - shr.s64 %rd25, %rd23, 4; - add.s64 %rd26, %rd25, %rd24; - mul.lo.s64 %rd27, %rd26, 56; - sub.s64 %rd28, %rd22, %rd27; - setp.lt.s64 %p3, %rd28, 0; - add.s64 %rd29, %rd28, 56; - selp.b64 %rd30, %rd29, %rd28, %p3; - shr.s64 %rd31, %rd22, 63; - xor.b64 %rd32, %rd31, %rd22; - mul.hi.s64 %rd33, %rd32, 5270498306774157605; - shr.u64 %rd34, %rd33, 63; - shr.s64 %rd35, %rd33, 4; - add.s64 %rd36, %rd35, %rd34; - xor.b64 %rd37, %rd36, %rd31; - mul.lo.s64 %rd38, %rd37, 3136; - mul.lo.s64 %rd39, %rd30, 56; - add.s64 %rd40, %rd39, %rd15; - add.s64 %rd41, %rd40, %rd38; - shl.b64 %rd42, %rd41, 1; - add.s64 %rd43, %rd2, %rd42; - ld.global.b16 %h1, [%rd43]; - cvt.f32.f16 %f1, %h1; - max.f32 %f2, %f1, 0f00000000; - cvt.rn.f16.f32 %h2, %f2; - add.s64 %rd44, %rd1, %rd42; - st.global.b16 [%rd44], %h2; -$L__BB76_2: + .reg .pred %p<3>; + .reg .b16 %rs<4>; + .reg .b32 %r<5>; + .reg .b64 %rd<19>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd13, %r3; + mul.wide.s32 %rd14, %r2, %r1; + add.s64 %rd18, %rd14, %rd13; + setp.gt.s64 %p1, %rd18, 200703; + @%p1 bra $L__BB23_3; + ld.param.u64 %rd11, [Unknown6_param_12]; + cvta.to.global.u64 %rd1, %rd11; + ld.param.u64 %rd12, [Unknown6_param_1]; + cvta.to.global.u64 %rd2, %rd12; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd17, %rd18, 1; + shl.b64 %rd6, %rd4, 1; +$L__BB23_2: + add.s64 %rd15, %rd2, %rd17; + ld.global.nc.u16 %rs1, [%rd15]; + mov.b16 %rs2, 0x0000; + max.NaN.f16 %rs3, %rs1, %rs2; + add.s64 %rd16, %rd1, %rd17; + st.global.b16 [%rd16], %rs3; + add.s64 %rd18, %rd18, %rd4; + add.s64 %rd17, %rd17, %rd6; + setp.lt.s64 %p2, %rd18, 200704; + @%p2 bra $L__BB23_2; +$L__BB23_3: ret; } @@ -5637,80 +1602,42 @@ $L__BB76_2: .param .u64 Unknown4_param_21 ) { - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; + .reg .pred %p<3>; + .reg .b16 %rs<2>; + .reg .b32 %r<5>; .reg .f32 %f<2>; - .reg .b64 %rd<57>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 36863; - @%p1 bra $L__BB77_2; - ld.param.u64 %rd4, [Unknown4_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown4_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 6148914691236517206; - shr.u64 %rd9, %rd8, 63; - add.s64 %rd10, %rd8, %rd9; - mul.lo.s64 %rd11, %rd10, 3; - sub.s64 %rd12, %rd3, %rd11; - setp.lt.s64 %p2, %rd12, 0; - add.s64 %rd13, %rd12, 3; - selp.b64 %rd14, %rd13, %rd12, %p2; - shr.s64 %rd15, %rd3, 63; - xor.b64 %rd16, %rd15, %rd3; - mul.hi.s64 %rd17, %rd16, 6148914691236517206; - shr.u64 %rd18, %rd17, 63; - add.s64 %rd19, %rd17, %rd18; - xor.b64 %rd20, %rd19, %rd15; - mul.hi.s64 %rd21, %rd20, 6148914691236517206; - shr.u64 %rd22, %rd21, 63; - add.s64 %rd23, %rd21, %rd22; - mul.lo.s64 %rd24, %rd23, 3; - sub.s64 %rd25, %rd20, %rd24; - setp.lt.s64 %p3, %rd25, 0; - add.s64 %rd26, %rd25, 3; - selp.b64 %rd27, %rd26, %rd25, %p3; - shr.s64 %rd28, %rd20, 63; - xor.b64 %rd29, %rd28, %rd20; - mul.hi.s64 %rd30, %rd29, 6148914691236517206; - shr.u64 %rd31, %rd30, 63; - add.s64 %rd32, %rd30, %rd31; - xor.b64 %rd33, %rd32, %rd28; - shr.s64 %rd34, %rd33, 63; - shr.u64 %rd35, %rd34, 58; - add.s64 %rd36, %rd33, %rd35; - and.b64 %rd37, %rd36, -64; - sub.s64 %rd38, %rd33, %rd37; - setp.lt.s64 %p4, %rd38, 0; - add.s64 %rd39, %rd38, 64; - selp.b64 %rd40, %rd39, %rd38, %p4; - xor.b64 %rd41, %rd34, %rd33; - shr.s64 %rd42, %rd41, 63; - shr.u64 
%rd43, %rd42, 58; - add.s64 %rd44, %rd41, %rd43; - shr.s64 %rd45, %rd44, 6; - xor.b64 %rd46, %rd45, %rd34; - mul.lo.s64 %rd47, %rd46, 576; - mul.lo.s64 %rd48, %rd40, 9; - mul.lo.s64 %rd49, %rd27, 3; - add.s64 %rd50, %rd49, %rd14; - add.s64 %rd51, %rd50, %rd48; - add.s64 %rd52, %rd51, %rd47; - shl.b64 %rd53, %rd52, 2; - add.s64 %rd54, %rd2, %rd53; - ld.global.f32 %f1, [%rd54]; - cvt.rn.f16.f32 %h1, %f1; - shl.b64 %rd55, %rd52, 1; - add.s64 %rd56, %rd1, %rd55; - st.global.b16 [%rd56], %h1; -$L__BB77_2: + .reg .b64 %rd<24>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 36863; + @%p1 bra $L__BB24_3; + ld.param.u64 %rd15, [Unknown4_param_12]; + cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, [Unknown4_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd19, %rd23, 2; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 2; + shl.b64 %rd20, %rd23, 1; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 1; +$L__BB24_2: + ld.global.nc.f32 %f1, [%rd22]; + cvt.rn.f16.f32 %rs1, %f1; + st.global.b16 [%rd21], %rs1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, %rd23, 36864; + @%p2 bra $L__BB24_2; +$L__BB24_3: ret; } @@ -5740,69 +1667,39 @@ $L__BB77_2: .param .u64 Unknown3_param_21 ) { - .reg .pred %p<4>; - .reg .b16 %h<3>; - .reg .b32 %r<4>; - .reg .f32 %f<3>; - .reg .b64 %rd<45>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 802815; - @%p1 bra $L__BB78_2; - ld.param.u64 %rd4, [Unknown3_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown3_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 5270498306774157605; - shr.u64 %rd9, %rd8, 63; - shr.s64 %rd10, %rd8, 5; - add.s64 %rd11, %rd10, %rd9; - mul.lo.s64 %rd12, %rd11, 112; - sub.s64 %rd13, %rd3, %rd12; - setp.lt.s64 %p2, %rd13, 0; - add.s64 %rd14, %rd13, 112; - selp.b64 %rd15, %rd14, %rd13, %p2; - shr.s64 %rd16, %rd3, 63; - xor.b64 %rd17, %rd16, %rd3; - mul.hi.s64 %rd18, %rd17, 5270498306774157605; - shr.u64 %rd19, %rd18, 63; - shr.s64 %rd20, %rd18, 5; - add.s64 %rd21, %rd20, %rd19; - xor.b64 %rd22, %rd21, %rd16; - mul.hi.s64 %rd23, %rd22, 5270498306774157605; - shr.u64 %rd24, %rd23, 63; - shr.s64 %rd25, %rd23, 5; - add.s64 %rd26, %rd25, %rd24; - mul.lo.s64 %rd27, %rd26, 112; - sub.s64 %rd28, %rd22, %rd27; - setp.lt.s64 %p3, %rd28, 0; - add.s64 %rd29, %rd28, 112; - selp.b64 %rd30, %rd29, %rd28, %p3; - shr.s64 %rd31, %rd22, 63; - xor.b64 %rd32, %rd31, %rd22; - mul.hi.s64 %rd33, %rd32, 5270498306774157605; - shr.u64 %rd34, %rd33, 63; - shr.s64 %rd35, %rd33, 5; - add.s64 %rd36, %rd35, %rd34; - xor.b64 %rd37, %rd36, %rd31; - mul.lo.s64 %rd38, %rd37, 12544; - mul.lo.s64 %rd39, %rd30, 112; - add.s64 %rd40, %rd39, %rd15; - add.s64 %rd41, %rd40, %rd38; - shl.b64 %rd42, %rd41, 1; - add.s64 %rd43, %rd2, %rd42; - ld.global.b16 %h1, [%rd43]; - cvt.f32.f16 %f1, %h1; - max.f32 %f2, %f1, 0f00000000; - cvt.rn.f16.f32 %h2, %f2; - add.s64 %rd44, %rd1, %rd42; - st.global.b16 [%rd44], %h2; -$L__BB78_2: + .reg .pred %p<3>; + .reg .b16 %rs<4>; + .reg .b32 %r<5>; + .reg .b64 %rd<19>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd13, %r3; + mul.wide.s32 %rd14, %r2, %r1; + add.s64 %rd18, %rd14, 
%rd13; + setp.gt.s64 %p1, %rd18, 802815; + @%p1 bra $L__BB25_3; + ld.param.u64 %rd11, [Unknown3_param_12]; + cvta.to.global.u64 %rd1, %rd11; + ld.param.u64 %rd12, [Unknown3_param_1]; + cvta.to.global.u64 %rd2, %rd12; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd17, %rd18, 1; + shl.b64 %rd6, %rd4, 1; +$L__BB25_2: + add.s64 %rd15, %rd2, %rd17; + ld.global.nc.u16 %rs1, [%rd15]; + mov.b16 %rs2, 0x0000; + max.NaN.f16 %rs3, %rs1, %rs2; + add.s64 %rd16, %rd1, %rd17; + st.global.b16 [%rd16], %rs3; + add.s64 %rd18, %rd18, %rd4; + add.s64 %rd17, %rd17, %rd6; + setp.lt.s64 %p2, %rd18, 802816; + @%p2 bra $L__BB25_2; +$L__BB25_3: ret; } @@ -5832,84 +1729,42 @@ $L__BB78_2: .param .u64 Unknown1_param_21 ) { - .reg .pred %p<5>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; + .reg .pred %p<3>; + .reg .b16 %rs<2>; + .reg .b32 %r<5>; .reg .f32 %f<2>; - .reg .b64 %rd<61>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 9407; - @%p1 bra $L__BB79_2; - ld.param.u64 %rd4, [Unknown1_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown1_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 5270498306774157605; - shr.u64 %rd9, %rd8, 63; - shr.s64 %rd10, %rd8, 1; - add.s64 %rd11, %rd10, %rd9; - mul.lo.s64 %rd12, %rd11, 7; - sub.s64 %rd13, %rd3, %rd12; - setp.lt.s64 %p2, %rd13, 0; - add.s64 %rd14, %rd13, 7; - selp.b64 %rd15, %rd14, %rd13, %p2; - shr.s64 %rd16, %rd3, 63; - xor.b64 %rd17, %rd16, %rd3; - mul.hi.s64 %rd18, %rd17, 5270498306774157605; - shr.u64 %rd19, %rd18, 63; - shr.s64 %rd20, %rd18, 1; - add.s64 %rd21, %rd20, %rd19; - xor.b64 %rd22, %rd21, %rd16; - mul.hi.s64 %rd23, %rd22, 5270498306774157605; - shr.u64 %rd24, %rd23, 63; - shr.s64 %rd25, %rd23, 1; - add.s64 %rd26, %rd25, %rd24; - mul.lo.s64 %rd27, %rd26, 7; - sub.s64 %rd28, %rd22, %rd27; - setp.lt.s64 %p3, %rd28, 0; - add.s64 %rd29, %rd28, 7; - selp.b64 %rd30, %rd29, %rd28, %p3; - shr.s64 %rd31, %rd22, 63; - xor.b64 %rd32, %rd31, %rd22; - mul.hi.s64 %rd33, %rd32, 5270498306774157605; - shr.u64 %rd34, %rd33, 63; - shr.s64 %rd35, %rd33, 1; - add.s64 %rd36, %rd35, %rd34; - xor.b64 %rd37, %rd36, %rd31; - mul.hi.s64 %rd38, %rd37, 6148914691236517206; - shr.u64 %rd39, %rd38, 63; - add.s64 %rd40, %rd38, %rd39; - mul.lo.s64 %rd41, %rd40, 3; - sub.s64 %rd42, %rd37, %rd41; - setp.lt.s64 %p4, %rd42, 0; - add.s64 %rd43, %rd42, 3; - selp.b64 %rd44, %rd43, %rd42, %p4; - shr.s64 %rd45, %rd37, 63; - xor.b64 %rd46, %rd45, %rd37; - mul.hi.s64 %rd47, %rd46, 6148914691236517206; - shr.u64 %rd48, %rd47, 63; - add.s64 %rd49, %rd47, %rd48; - xor.b64 %rd50, %rd49, %rd45; - mul.lo.s64 %rd51, %rd50, 147; - mul.lo.s64 %rd52, %rd44, 49; - mul.lo.s64 %rd53, %rd30, 7; - add.s64 %rd54, %rd53, %rd15; - add.s64 %rd55, %rd54, %rd52; - add.s64 %rd56, %rd55, %rd51; - shl.b64 %rd57, %rd56, 2; - add.s64 %rd58, %rd2, %rd57; - ld.global.f32 %f1, [%rd58]; - cvt.rn.f16.f32 %h1, %f1; - shl.b64 %rd59, %rd56, 1; - add.s64 %rd60, %rd1, %rd59; - st.global.b16 [%rd60], %h1; -$L__BB79_2: + .reg .b64 %rd<24>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 9407; + @%p1 bra $L__BB26_3; + ld.param.u64 %rd15, [Unknown1_param_12]; + cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, [Unknown1_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, 
%r2, %r4; + shl.b64 %rd19, %rd23, 2; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 2; + shl.b64 %rd20, %rd23, 1; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 1; +$L__BB26_2: + ld.global.nc.f32 %f1, [%rd22]; + cvt.rn.f16.f32 %rs1, %f1; + st.global.b16 [%rd21], %rs1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, %rd23, 9408; + @%p2 bra $L__BB26_2; +$L__BB26_3: ret; } @@ -5939,68 +1794,184 @@ $L__BB79_2: .param .u64 Unknown0_param_21 ) { - .reg .pred %p<4>; - .reg .b16 %h<2>; - .reg .b32 %r<4>; + .reg .pred %p<3>; + .reg .b16 %rs<2>; + .reg .b32 %r<5>; .reg .f32 %f<2>; - .reg .b64 %rd<46>; - - mov.u32 %r1, %ctaid.x; - mov.u32 %r2, %ntid.x; - mov.u32 %r3, %tid.x; - cvt.s64.s32 %rd6, %r3; - mul.wide.s32 %rd7, %r2, %r1; - add.s64 %rd3, %rd7, %rd6; - setp.gt.s64 %p1, %rd3, 150527; - @%p1 bra $L__BB80_2; - ld.param.u64 %rd4, [Unknown0_param_12]; - cvta.to.global.u64 %rd1, %rd4; - ld.param.u64 %rd5, [Unknown0_param_1]; - cvta.to.global.u64 %rd2, %rd5; - mul.hi.s64 %rd8, %rd3, 5270498306774157605; - shr.u64 %rd9, %rd8, 63; - shr.s64 %rd10, %rd8, 6; - add.s64 %rd11, %rd10, %rd9; - mul.lo.s64 %rd12, %rd11, 224; - sub.s64 %rd13, %rd3, %rd12; - setp.lt.s64 %p2, %rd13, 0; - add.s64 %rd14, %rd13, 224; - selp.b64 %rd15, %rd14, %rd13, %p2; - shr.s64 %rd16, %rd3, 63; - xor.b64 %rd17, %rd16, %rd3; - mul.hi.s64 %rd18, %rd17, 5270498306774157605; - shr.u64 %rd19, %rd18, 63; - shr.s64 %rd20, %rd18, 6; - add.s64 %rd21, %rd20, %rd19; - xor.b64 %rd22, %rd21, %rd16; - mul.hi.s64 %rd23, %rd22, 5270498306774157605; - shr.u64 %rd24, %rd23, 63; - shr.s64 %rd25, %rd23, 6; - add.s64 %rd26, %rd25, %rd24; - mul.lo.s64 %rd27, %rd26, 224; - sub.s64 %rd28, %rd22, %rd27; - setp.lt.s64 %p3, %rd28, 0; - add.s64 %rd29, %rd28, 224; - selp.b64 %rd30, %rd29, %rd28, %p3; - shr.s64 %rd31, %rd22, 63; - xor.b64 %rd32, %rd31, %rd22; - mul.hi.s64 %rd33, %rd32, 5270498306774157605; - shr.u64 %rd34, %rd33, 63; - shr.s64 %rd35, %rd33, 6; - add.s64 %rd36, %rd35, %rd34; - xor.b64 %rd37, %rd36, %rd31; - mul.lo.s64 %rd38, %rd37, 50176; - mul.lo.s64 %rd39, %rd30, 224; - add.s64 %rd40, %rd39, %rd15; - add.s64 %rd41, %rd40, %rd38; - shl.b64 %rd42, %rd41, 2; - add.s64 %rd43, %rd2, %rd42; - ld.global.f32 %f1, [%rd43]; - cvt.rn.f16.f32 %h1, %f1; - shl.b64 %rd44, %rd41, 1; - add.s64 %rd45, %rd1, %rd44; - st.global.b16 [%rd45], %h1; -$L__BB80_2: + .reg .b64 %rd<24>; + + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %ntid.x; + mov.u32 %r3, %tid.x; + cvt.s64.s32 %rd17, %r3; + mul.wide.s32 %rd18, %r2, %r1; + add.s64 %rd23, %rd18, %rd17; + setp.gt.s64 %p1, %rd23, 150527; + @%p1 bra $L__BB27_3; + ld.param.u64 %rd15, [Unknown0_param_12]; + cvta.to.global.u64 %rd1, %rd15; + ld.param.u64 %rd16, [Unknown0_param_1]; + cvta.to.global.u64 %rd2, %rd16; + mov.u32 %r4, %nctaid.x; + mul.wide.s32 %rd4, %r2, %r4; + shl.b64 %rd19, %rd23, 2; + add.s64 %rd22, %rd2, %rd19; + shl.b64 %rd6, %rd4, 2; + shl.b64 %rd20, %rd23, 1; + add.s64 %rd21, %rd1, %rd20; + shl.b64 %rd8, %rd4, 1; +$L__BB27_2: + ld.global.nc.f32 %f1, [%rd22]; + cvt.rn.f16.f32 %rs1, %f1; + st.global.b16 [%rd21], %rs1; + add.s64 %rd23, %rd23, %rd4; + add.s64 %rd22, %rd22, %rd6; + add.s64 %rd21, %rd21, %rd8; + setp.lt.s64 %p2, %rd23, 150528; + @%p2 bra $L__BB27_2; +$L__BB27_3: + ret; + +} + // .globl Unknown58_kernel +.visible .entry Unknown58_kernel( + .param .u64 Unknown58_kernel_param_0, + .param .u64 Unknown58_kernel_param_1, + .param .u64 Unknown58_kernel_param_2, + .param .u64 Unknown58_kernel_param_3, + .param .u64 Unknown58_kernel_param_4, 
+ .param .u64 Unknown58_kernel_param_5, + .param .u64 Unknown58_kernel_param_6, + .param .u64 Unknown58_kernel_param_7, + .param .u64 Unknown58_kernel_param_8, + .param .u64 Unknown58_kernel_param_9, + .param .u64 Unknown58_kernel_param_10, + .param .u64 Unknown58_kernel_param_11 +) +{ + .reg .pred %p<8>; + .reg .b16 %rs<37>; + .reg .b32 %r<12>; + .reg .b64 %rd<55>; + // demoted variable + .shared .align 2 .b8 __wg_Unknown58_kernel_0[128]; + // demoted variable + .shared .align 2 .b8 __wg_Unknown58_kernel_1[64]; + // demoted variable + .shared .align 2 .b8 __wg_Unknown58_kernel_2[32]; + // demoted variable + .shared .align 2 .b8 __wg_Unknown58_kernel_3[16]; + // demoted variable + .shared .align 2 .b8 __wg_Unknown58_kernel_4[8]; + // demoted variable + .shared .align 2 .b8 __wg_Unknown58_kernel_5[4]; + mov.u32 %r1, %ctaid.x; + mov.u32 %r2, %tid.x; + cvt.s64.s32 %rd3, %r2; + and.b64 %rd9, %rd3, 63; + min.u64 %rd10, %rd9, 49; + min.u64 %rd11, %rd9, 48; + add.s64 %rd12, %rd11, 1; + setp.eq.s64 %p1, %rd12, %rd10; + mov.b16 %rs36, 0x0000; + @%p1 bra $L__BB28_2; + ld.param.u64 %rd7, [Unknown58_kernel_param_1]; + cvta.to.global.u64 %rd8, %rd7; + mul.wide.s32 %rd13, %r1, 98; + add.s64 %rd14, %rd8, %rd13; + shl.b64 %rd15, %rd10, 1; + add.s64 %rd4, %rd14, %rd15; + ld.global.nc.u16 %rs4, [%rd4]; + mov.b16 %rs5, 0x0000; + add.rn.f16 %rs36, %rs4, %rs5; +$L__BB28_2: + cvt.u32.u64 %r3, %rd3; + shl.b64 %rd16, %rd3, 1; + mov.u64 %rd17, __wg_Unknown58_kernel_0; + add.s64 %rd5, %rd17, %rd16; + st.shared.b16 [%rd5], %rs36; + bar.sync 0; + setp.gt.u32 %p2, %r3, 31; + mov.u64 %rd54, __wg_Unknown58_kernel_1; + @%p2 bra $L__BB28_4; + add.s64 %rd19, %rd5, %rd16; + ld.shared.b32 %r4, [%rd19]; + mov.b32 {%rs6, %rs7}, %r4; + mov.b16 %rs8, 0x0000; + add.rn.f16 %rs9, %rs6, %rs8; + add.rn.f16 %rs10, %rs7, %rs9; + add.s64 %rd21, %rd54, %rd16; + st.shared.b16 [%rd21], %rs10; +$L__BB28_4: + bar.sync 0; + setp.gt.u32 %p3, %r3, 15; + shl.b64 %rd52, %rd3, 2; + mov.u64 %rd53, __wg_Unknown58_kernel_2; + @%p3 bra $L__BB28_6; + add.s64 %rd25, %rd54, %rd52; + ld.shared.b32 %r6, [%rd25]; + mov.b32 {%rs11, %rs12}, %r6; + mov.b16 %rs13, 0x0000; + add.rn.f16 %rs14, %rs11, %rs13; + add.rn.f16 %rs15, %rs12, %rs14; + add.s64 %rd27, %rd53, %rd16; + st.shared.b16 [%rd27], %rs15; +$L__BB28_6: + bar.sync 0; + setp.gt.u32 %p4, %r3, 7; + mov.u64 %rd51, __wg_Unknown58_kernel_3; + @%p4 bra $L__BB28_8; + add.s64 %rd31, %rd53, %rd52; + ld.shared.b32 %r8, [%rd31]; + mov.b32 {%rs16, %rs17}, %r8; + mov.b16 %rs18, 0x0000; + add.rn.f16 %rs19, %rs16, %rs18; + add.rn.f16 %rs20, %rs17, %rs19; + add.s64 %rd33, %rd51, %rd16; + st.shared.b16 [%rd33], %rs20; +$L__BB28_8: + bar.sync 0; + setp.gt.u32 %p5, %r3, 3; + mov.u64 %rd49, __wg_Unknown58_kernel_4; + @%p5 bra $L__BB28_10; + add.s64 %rd37, %rd51, %rd52; + ld.shared.b16 %rs21, [%rd37]; + mov.b16 %rs22, 0x0000; + add.rn.f16 %rs23, %rs21, %rs22; + ld.shared.b16 %rs24, [%rd37+2]; + add.rn.f16 %rs25, %rs24, %rs23; + add.s64 %rd39, %rd49, %rd16; + st.shared.b16 [%rd39], %rs25; +$L__BB28_10: + bar.sync 0; + setp.gt.u32 %p6, %r3, 1; + @%p6 bra $L__BB28_12; + add.s64 %rd43, %rd49, %rd52; + ld.shared.b16 %rs26, [%rd43]; + mov.b16 %rs27, 0x0000; + add.rn.f16 %rs28, %rs26, %rs27; + ld.shared.b16 %rs29, [%rd43+2]; + add.rn.f16 %rs30, %rs29, %rs28; + mov.u64 %rd44, __wg_Unknown58_kernel_5; + add.s64 %rd45, %rd44, %rd16; + st.shared.b16 [%rd45], %rs30; +$L__BB28_12: + bar.sync 0; + setp.ne.s32 %p7, %r3, 0; + @%p7 bra $L__BB28_14; + ld.param.u64 %rd6, [Unknown58_kernel_param_8]; + cvta.to.global.u64 %rd1, %rd6; + 
cvt.s64.s32 %rd2, %r1; + ld.shared.b16 %rs31, [__wg_Unknown58_kernel_5]; + mov.b16 %rs32, 0x0000; + add.rn.f16 %rs33, %rs31, %rs32; + ld.shared.b16 %rs34, [__wg_Unknown58_kernel_5+2]; + add.rn.f16 %rs35, %rs34, %rs33; + shl.b64 %rd46, %rd2, 1; + add.s64 %rd47, %rd1, %rd46; + st.global.b16 [%rd47], %rs35; +$L__BB28_14: + bar.sync 0; ret; } diff --git a/compiler/test/E2E/ResNet18/FW/host_output.mlir b/compiler/test/E2E/ResNet18/FW/host_output.mlir index 9dafad4e7..35356cb24 100644 --- a/compiler/test/E2E/ResNet18/FW/host_output.mlir +++ b/compiler/test/E2E/ResNet18/FW/host_output.mlir @@ -5,182 +5,184 @@ module attributes {byre.container_module, gpu.container_module} { func.func @main(%arg0: memref<64xf32, "cuda"> {byre.argname = "Input0", byre.argtype = 1 : i32}, %arg1: memref<64xf32, "cuda"> {byre.argname = "Input1", byre.argtype = 1 : i32}, %arg2: memref<64x3x7x7xf32, "cuda"> {byre.argname = "Input2", byre.argtype = 1 : i32}, %arg3: memref<1000xf32, "cuda"> {byre.argname = "Input3", byre.argtype = 1 : i32}, %arg4: memref<1000x512xf32, "cuda"> {byre.argname = "Input4", byre.argtype = 1 : i32}, %arg5: memref<64xf32, "cuda"> {byre.argname = "Input5", byre.argtype = 1 : i32}, %arg6: memref<64xf32, "cuda"> {byre.argname = "Input6", byre.argtype = 1 : i32}, %arg7: memref<64xf32, "cuda"> {byre.argname = "Input7", byre.argtype = 1 : i32}, %arg8: memref<64xf32, "cuda"> {byre.argname = "Input8", byre.argtype = 1 : i32}, %arg9: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Input9", byre.argtype = 1 : i32}, %arg10: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Input10", byre.argtype = 1 : i32}, %arg11: memref<64xf32, "cuda"> {byre.argname = "Input11", byre.argtype = 1 : i32}, %arg12: memref<64xf32, "cuda"> {byre.argname = "Input12", byre.argtype = 1 : i32}, %arg13: memref<64xf32, "cuda"> {byre.argname = "Input13", byre.argtype = 1 : i32}, %arg14: memref<64xf32, "cuda"> {byre.argname = "Input14", byre.argtype = 1 : i32}, %arg15: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Input15", byre.argtype = 1 : i32}, %arg16: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Input16", byre.argtype = 1 : i32}, %arg17: memref<128xf32, "cuda"> {byre.argname = "Input17", byre.argtype = 1 : i32}, %arg18: memref<128xf32, "cuda"> {byre.argname = "Input18", byre.argtype = 1 : i32}, %arg19: memref<128xf32, "cuda"> {byre.argname = "Input19", byre.argtype = 1 : i32}, %arg20: memref<128xf32, "cuda"> {byre.argname = "Input20", byre.argtype = 1 : i32}, %arg21: memref<128x64x3x3xf32, "cuda"> {byre.argname = "Input21", byre.argtype = 1 : i32}, %arg22: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Input22", byre.argtype = 1 : i32}, %arg23: memref<128x64x1x1xf32, "cuda"> {byre.argname = "Input23", byre.argtype = 1 : i32}, %arg24: memref<128xf32, "cuda"> {byre.argname = "Input24", byre.argtype = 1 : i32}, %arg25: memref<128xf32, "cuda"> {byre.argname = "Input25", byre.argtype = 1 : i32}, %arg26: memref<128xf32, "cuda"> {byre.argname = "Input26", byre.argtype = 1 : i32}, %arg27: memref<128xf32, "cuda"> {byre.argname = "Input27", byre.argtype = 1 : i32}, %arg28: memref<128xf32, "cuda"> {byre.argname = "Input28", byre.argtype = 1 : i32}, %arg29: memref<128xf32, "cuda"> {byre.argname = "Input29", byre.argtype = 1 : i32}, %arg30: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Input30", byre.argtype = 1 : i32}, %arg31: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Input31", byre.argtype = 1 : i32}, %arg32: memref<256xf32, "cuda"> {byre.argname = "Input32", byre.argtype = 1 : i32}, %arg33: memref<256xf32, "cuda"> {byre.argname = 
"Input33", byre.argtype = 1 : i32}, %arg34: memref<256xf32, "cuda"> {byre.argname = "Input34", byre.argtype = 1 : i32}, %arg35: memref<256xf32, "cuda"> {byre.argname = "Input35", byre.argtype = 1 : i32}, %arg36: memref<256x128x3x3xf32, "cuda"> {byre.argname = "Input36", byre.argtype = 1 : i32}, %arg37: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Input37", byre.argtype = 1 : i32}, %arg38: memref<256x128x1x1xf32, "cuda"> {byre.argname = "Input38", byre.argtype = 1 : i32}, %arg39: memref<256xf32, "cuda"> {byre.argname = "Input39", byre.argtype = 1 : i32}, %arg40: memref<256xf32, "cuda"> {byre.argname = "Input40", byre.argtype = 1 : i32}, %arg41: memref<256xf32, "cuda"> {byre.argname = "Input41", byre.argtype = 1 : i32}, %arg42: memref<256xf32, "cuda"> {byre.argname = "Input42", byre.argtype = 1 : i32}, %arg43: memref<256xf32, "cuda"> {byre.argname = "Input43", byre.argtype = 1 : i32}, %arg44: memref<256xf32, "cuda"> {byre.argname = "Input44", byre.argtype = 1 : i32}, %arg45: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Input45", byre.argtype = 1 : i32}, %arg46: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Input46", byre.argtype = 1 : i32}, %arg47: memref<512xf32, "cuda"> {byre.argname = "Input47", byre.argtype = 1 : i32}, %arg48: memref<512xf32, "cuda"> {byre.argname = "Input48", byre.argtype = 1 : i32}, %arg49: memref<512xf32, "cuda"> {byre.argname = "Input49", byre.argtype = 1 : i32}, %arg50: memref<512xf32, "cuda"> {byre.argname = "Input50", byre.argtype = 1 : i32}, %arg51: memref<512x256x3x3xf32, "cuda"> {byre.argname = "Input51", byre.argtype = 1 : i32}, %arg52: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Input52", byre.argtype = 1 : i32}, %arg53: memref<512x256x1x1xf32, "cuda"> {byre.argname = "Input53", byre.argtype = 1 : i32}, %arg54: memref<512xf32, "cuda"> {byre.argname = "Input54", byre.argtype = 1 : i32}, %arg55: memref<512xf32, "cuda"> {byre.argname = "Input55", byre.argtype = 1 : i32}, %arg56: memref<512xf32, "cuda"> {byre.argname = "Input56", byre.argtype = 1 : i32}, %arg57: memref<512xf32, "cuda"> {byre.argname = "Input57", byre.argtype = 1 : i32}, %arg58: memref<512xf32, "cuda"> {byre.argname = "Input58", byre.argtype = 1 : i32}, %arg59: memref<512xf32, "cuda"> {byre.argname = "Input59", byre.argtype = 1 : i32}, %arg60: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Input60", byre.argtype = 1 : i32}, %arg61: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Input61", byre.argtype = 1 : i32}, %arg62: memref {byre.argname = "Input62", byre.argtype = 1 : i32}, %arg63: memref<64xf32, "cuda"> {byre.argname = "Input63", byre.argtype = 1 : i32}, %arg64: memref<64xf32, "cuda"> {byre.argname = "Input64", byre.argtype = 1 : i32}, %arg65: memref {byre.argname = "Input65", byre.argtype = 1 : i32}, %arg66: memref<64xf32, "cuda"> {byre.argname = "Input66", byre.argtype = 1 : i32}, %arg67: memref<64xf32, "cuda"> {byre.argname = "Input67", byre.argtype = 1 : i32}, %arg68: memref {byre.argname = "Input68", byre.argtype = 1 : i32}, %arg69: memref<64xf32, "cuda"> {byre.argname = "Input69", byre.argtype = 1 : i32}, %arg70: memref<64xf32, "cuda"> {byre.argname = "Input70", byre.argtype = 1 : i32}, %arg71: memref {byre.argname = "Input71", byre.argtype = 1 : i32}, %arg72: memref<64xf32, "cuda"> {byre.argname = "Input72", byre.argtype = 1 : i32}, %arg73: memref<64xf32, "cuda"> {byre.argname = "Input73", byre.argtype = 1 : i32}, %arg74: memref {byre.argname = "Input74", byre.argtype = 1 : i32}, %arg75: memref<64xf32, "cuda"> {byre.argname = "Input75", byre.argtype = 1 : i32}, 
%arg76: memref<64xf32, "cuda"> {byre.argname = "Input76", byre.argtype = 1 : i32}, %arg77: memref {byre.argname = "Input77", byre.argtype = 1 : i32}, %arg78: memref<128xf32, "cuda"> {byre.argname = "Input78", byre.argtype = 1 : i32}, %arg79: memref<128xf32, "cuda"> {byre.argname = "Input79", byre.argtype = 1 : i32}, %arg80: memref {byre.argname = "Input80", byre.argtype = 1 : i32}, %arg81: memref<128xf32, "cuda"> {byre.argname = "Input81", byre.argtype = 1 : i32}, %arg82: memref<128xf32, "cuda"> {byre.argname = "Input82", byre.argtype = 1 : i32}, %arg83: memref {byre.argname = "Input83", byre.argtype = 1 : i32}, %arg84: memref<128xf32, "cuda"> {byre.argname = "Input84", byre.argtype = 1 : i32}, %arg85: memref<128xf32, "cuda"> {byre.argname = "Input85", byre.argtype = 1 : i32}, %arg86: memref {byre.argname = "Input86", byre.argtype = 1 : i32}, %arg87: memref<128xf32, "cuda"> {byre.argname = "Input87", byre.argtype = 1 : i32}, %arg88: memref<128xf32, "cuda"> {byre.argname = "Input88", byre.argtype = 1 : i32}, %arg89: memref {byre.argname = "Input89", byre.argtype = 1 : i32}, %arg90: memref<128xf32, "cuda"> {byre.argname = "Input90", byre.argtype = 1 : i32}, %arg91: memref<128xf32, "cuda"> {byre.argname = "Input91", byre.argtype = 1 : i32}, %arg92: memref {byre.argname = "Input92", byre.argtype = 1 : i32}, %arg93: memref<256xf32, "cuda"> {byre.argname = "Input93", byre.argtype = 1 : i32}, %arg94: memref<256xf32, "cuda"> {byre.argname = "Input94", byre.argtype = 1 : i32}, %arg95: memref {byre.argname = "Input95", byre.argtype = 1 : i32}, %arg96: memref<256xf32, "cuda"> {byre.argname = "Input96", byre.argtype = 1 : i32}, %arg97: memref<256xf32, "cuda"> {byre.argname = "Input97", byre.argtype = 1 : i32}, %arg98: memref {byre.argname = "Input98", byre.argtype = 1 : i32}, %arg99: memref<256xf32, "cuda"> {byre.argname = "Input99", byre.argtype = 1 : i32}, %arg100: memref<256xf32, "cuda"> {byre.argname = "Input100", byre.argtype = 1 : i32}, %arg101: memref {byre.argname = "Input101", byre.argtype = 1 : i32}, %arg102: memref<256xf32, "cuda"> {byre.argname = "Input102", byre.argtype = 1 : i32}, %arg103: memref<256xf32, "cuda"> {byre.argname = "Input103", byre.argtype = 1 : i32}, %arg104: memref {byre.argname = "Input104", byre.argtype = 1 : i32}, %arg105: memref<256xf32, "cuda"> {byre.argname = "Input105", byre.argtype = 1 : i32}, %arg106: memref<256xf32, "cuda"> {byre.argname = "Input106", byre.argtype = 1 : i32}, %arg107: memref {byre.argname = "Input107", byre.argtype = 1 : i32}, %arg108: memref<512xf32, "cuda"> {byre.argname = "Input108", byre.argtype = 1 : i32}, %arg109: memref<512xf32, "cuda"> {byre.argname = "Input109", byre.argtype = 1 : i32}, %arg110: memref {byre.argname = "Input110", byre.argtype = 1 : i32}, %arg111: memref<512xf32, "cuda"> {byre.argname = "Input111", byre.argtype = 1 : i32}, %arg112: memref<512xf32, "cuda"> {byre.argname = "Input112", byre.argtype = 1 : i32}, %arg113: memref {byre.argname = "Input113", byre.argtype = 1 : i32}, %arg114: memref<512xf32, "cuda"> {byre.argname = "Input114", byre.argtype = 1 : i32}, %arg115: memref<512xf32, "cuda"> {byre.argname = "Input115", byre.argtype = 1 : i32}, %arg116: memref {byre.argname = "Input116", byre.argtype = 1 : i32}, %arg117: memref<512xf32, "cuda"> {byre.argname = "Input117", byre.argtype = 1 : i32}, %arg118: memref<512xf32, "cuda"> {byre.argname = "Input118", byre.argtype = 1 : i32}, %arg119: memref {byre.argname = "Input119", byre.argtype = 1 : i32}, %arg120: memref<512xf32, "cuda"> {byre.argname = "Input120", byre.argtype 
= 1 : i32}, %arg121: memref<512xf32, "cuda"> {byre.argname = "Input121", byre.argtype = 1 : i32}, %arg122: memref<1x3x224x224xf32, "cuda"> {byre.argname = "Input122", byre.argtype = 1 : i32}, %arg123: memref<1x1000xf16, "cuda"> {byre.argname = "Output0", byre.argtype = 2 : i32}, %arg124: memref<64xf32, "cuda"> {byre.arg_alias_index = 0 : i64, byre.argname = "Output1", byre.argtype = 2 : i32}, %arg125: memref<64xf32, "cuda"> {byre.arg_alias_index = 1 : i64, byre.argname = "Output2", byre.argtype = 2 : i32}, %arg126: memref<64xf32, "cuda"> {byre.arg_alias_index = 5 : i64, byre.argname = "Output3", byre.argtype = 2 : i32}, %arg127: memref<64xf32, "cuda"> {byre.arg_alias_index = 6 : i64, byre.argname = "Output4", byre.argtype = 2 : i32}, %arg128: memref<64xf32, "cuda"> {byre.arg_alias_index = 7 : i64, byre.argname = "Output5", byre.argtype = 2 : i32}, %arg129: memref<64xf32, "cuda"> {byre.arg_alias_index = 8 : i64, byre.argname = "Output6", byre.argtype = 2 : i32}, %arg130: memref<64xf32, "cuda"> {byre.arg_alias_index = 11 : i64, byre.argname = "Output7", byre.argtype = 2 : i32}, %arg131: memref<64xf32, "cuda"> {byre.arg_alias_index = 12 : i64, byre.argname = "Output8", byre.argtype = 2 : i32}, %arg132: memref<64xf32, "cuda"> {byre.arg_alias_index = 13 : i64, byre.argname = "Output9", byre.argtype = 2 : i32}, %arg133: memref<64xf32, "cuda"> {byre.arg_alias_index = 14 : i64, byre.argname = "Output10", byre.argtype = 2 : i32}, %arg134: memref<128xf32, "cuda"> {byre.arg_alias_index = 17 : i64, byre.argname = "Output11", byre.argtype = 2 : i32}, %arg135: memref<128xf32, "cuda"> {byre.arg_alias_index = 18 : i64, byre.argname = "Output12", byre.argtype = 2 : i32}, %arg136: memref<128xf32, "cuda"> {byre.arg_alias_index = 19 : i64, byre.argname = "Output13", byre.argtype = 2 : i32}, %arg137: memref<128xf32, "cuda"> {byre.arg_alias_index = 20 : i64, byre.argname = "Output14", byre.argtype = 2 : i32}, %arg138: memref<128xf32, "cuda"> {byre.arg_alias_index = 24 : i64, byre.argname = "Output15", byre.argtype = 2 : i32}, %arg139: memref<128xf32, "cuda"> {byre.arg_alias_index = 25 : i64, byre.argname = "Output16", byre.argtype = 2 : i32}, %arg140: memref<128xf32, "cuda"> {byre.arg_alias_index = 26 : i64, byre.argname = "Output17", byre.argtype = 2 : i32}, %arg141: memref<128xf32, "cuda"> {byre.arg_alias_index = 27 : i64, byre.argname = "Output18", byre.argtype = 2 : i32}, %arg142: memref<128xf32, "cuda"> {byre.arg_alias_index = 28 : i64, byre.argname = "Output19", byre.argtype = 2 : i32}, %arg143: memref<128xf32, "cuda"> {byre.arg_alias_index = 29 : i64, byre.argname = "Output20", byre.argtype = 2 : i32}, %arg144: memref<256xf32, "cuda"> {byre.arg_alias_index = 32 : i64, byre.argname = "Output21", byre.argtype = 2 : i32}, %arg145: memref<256xf32, "cuda"> {byre.arg_alias_index = 33 : i64, byre.argname = "Output22", byre.argtype = 2 : i32}, %arg146: memref<256xf32, "cuda"> {byre.arg_alias_index = 34 : i64, byre.argname = "Output23", byre.argtype = 2 : i32}, %arg147: memref<256xf32, "cuda"> {byre.arg_alias_index = 35 : i64, byre.argname = "Output24", byre.argtype = 2 : i32}, %arg148: memref<256xf32, "cuda"> {byre.arg_alias_index = 39 : i64, byre.argname = "Output25", byre.argtype = 2 : i32}, %arg149: memref<256xf32, "cuda"> {byre.arg_alias_index = 40 : i64, byre.argname = "Output26", byre.argtype = 2 : i32}, %arg150: memref<256xf32, "cuda"> {byre.arg_alias_index = 41 : i64, byre.argname = "Output27", byre.argtype = 2 : i32}, %arg151: memref<256xf32, "cuda"> {byre.arg_alias_index = 42 : i64, byre.argname = 
"Output28", byre.argtype = 2 : i32}, %arg152: memref<256xf32, "cuda"> {byre.arg_alias_index = 43 : i64, byre.argname = "Output29", byre.argtype = 2 : i32}, %arg153: memref<256xf32, "cuda"> {byre.arg_alias_index = 44 : i64, byre.argname = "Output30", byre.argtype = 2 : i32}, %arg154: memref<512xf32, "cuda"> {byre.arg_alias_index = 47 : i64, byre.argname = "Output31", byre.argtype = 2 : i32}, %arg155: memref<512xf32, "cuda"> {byre.arg_alias_index = 48 : i64, byre.argname = "Output32", byre.argtype = 2 : i32}, %arg156: memref<512xf32, "cuda"> {byre.arg_alias_index = 49 : i64, byre.argname = "Output33", byre.argtype = 2 : i32}, %arg157: memref<512xf32, "cuda"> {byre.arg_alias_index = 50 : i64, byre.argname = "Output34", byre.argtype = 2 : i32}, %arg158: memref<512xf32, "cuda"> {byre.arg_alias_index = 54 : i64, byre.argname = "Output35", byre.argtype = 2 : i32}, %arg159: memref<512xf32, "cuda"> {byre.arg_alias_index = 55 : i64, byre.argname = "Output36", byre.argtype = 2 : i32}, %arg160: memref<512xf32, "cuda"> {byre.arg_alias_index = 56 : i64, byre.argname = "Output37", byre.argtype = 2 : i32}, %arg161: memref<512xf32, "cuda"> {byre.arg_alias_index = 57 : i64, byre.argname = "Output38", byre.argtype = 2 : i32}, %arg162: memref<512xf32, "cuda"> {byre.arg_alias_index = 58 : i64, byre.argname = "Output39", byre.argtype = 2 : i32}, %arg163: memref<512xf32, "cuda"> {byre.arg_alias_index = 59 : i64, byre.argname = "Output40", byre.argtype = 2 : i32}, %arg164: memref<64xf32, "cuda"> {byre.argname = "Output41", byre.argtype = 2 : i32}, %arg165: memref<64xf32, "cuda"> {byre.argname = "Output42", byre.argtype = 2 : i32}, %arg166: memref<64xf32, "cuda"> {byre.argname = "Output43", byre.argtype = 2 : i32}, %arg167: memref<64xf32, "cuda"> {byre.argname = "Output44", byre.argtype = 2 : i32}, %arg168: memref<64xf32, "cuda"> {byre.argname = "Output45", byre.argtype = 2 : i32}, %arg169: memref<64xf32, "cuda"> {byre.argname = "Output46", byre.argtype = 2 : i32}, %arg170: memref<64xf32, "cuda"> {byre.argname = "Output47", byre.argtype = 2 : i32}, %arg171: memref<64xf32, "cuda"> {byre.argname = "Output48", byre.argtype = 2 : i32}, %arg172: memref<64xf32, "cuda"> {byre.argname = "Output49", byre.argtype = 2 : i32}, %arg173: memref<64xf32, "cuda"> {byre.argname = "Output50", byre.argtype = 2 : i32}, %arg174: memref<128xf32, "cuda"> {byre.argname = "Output51", byre.argtype = 2 : i32}, %arg175: memref<128xf32, "cuda"> {byre.argname = "Output52", byre.argtype = 2 : i32}, %arg176: memref<128xf32, "cuda"> {byre.argname = "Output53", byre.argtype = 2 : i32}, %arg177: memref<128xf32, "cuda"> {byre.argname = "Output54", byre.argtype = 2 : i32}, %arg178: memref<128xf32, "cuda"> {byre.argname = "Output55", byre.argtype = 2 : i32}, %arg179: memref<128xf32, "cuda"> {byre.argname = "Output56", byre.argtype = 2 : i32}, %arg180: memref<128xf32, "cuda"> {byre.argname = "Output57", byre.argtype = 2 : i32}, %arg181: memref<128xf32, "cuda"> {byre.argname = "Output58", byre.argtype = 2 : i32}, %arg182: memref<128xf32, "cuda"> {byre.argname = "Output59", byre.argtype = 2 : i32}, %arg183: memref<128xf32, "cuda"> {byre.argname = "Output60", byre.argtype = 2 : i32}, %arg184: memref<256xf32, "cuda"> {byre.argname = "Output61", byre.argtype = 2 : i32}, %arg185: memref<256xf32, "cuda"> {byre.argname = "Output62", byre.argtype = 2 : i32}, %arg186: memref<256xf32, "cuda"> {byre.argname = "Output63", byre.argtype = 2 : i32}, %arg187: memref<256xf32, "cuda"> {byre.argname = "Output64", byre.argtype = 2 : i32}, %arg188: memref<256xf32, "cuda"> 
{byre.argname = "Output65", byre.argtype = 2 : i32}, %arg189: memref<256xf32, "cuda"> {byre.argname = "Output66", byre.argtype = 2 : i32}, %arg190: memref<256xf32, "cuda"> {byre.argname = "Output67", byre.argtype = 2 : i32}, %arg191: memref<256xf32, "cuda"> {byre.argname = "Output68", byre.argtype = 2 : i32}, %arg192: memref<256xf32, "cuda"> {byre.argname = "Output69", byre.argtype = 2 : i32}, %arg193: memref<256xf32, "cuda"> {byre.argname = "Output70", byre.argtype = 2 : i32}, %arg194: memref<512xf32, "cuda"> {byre.argname = "Output71", byre.argtype = 2 : i32}, %arg195: memref<512xf32, "cuda"> {byre.argname = "Output72", byre.argtype = 2 : i32}, %arg196: memref<512xf32, "cuda"> {byre.argname = "Output73", byre.argtype = 2 : i32}, %arg197: memref<512xf32, "cuda"> {byre.argname = "Output74", byre.argtype = 2 : i32}, %arg198: memref<512xf32, "cuda"> {byre.argname = "Output75", byre.argtype = 2 : i32}, %arg199: memref<512xf32, "cuda"> {byre.argname = "Output76", byre.argtype = 2 : i32}, %arg200: memref<512xf32, "cuda"> {byre.argname = "Output77", byre.argtype = 2 : i32}, %arg201: memref<512xf32, "cuda"> {byre.argname = "Output78", byre.argtype = 2 : i32}, %arg202: memref<512xf32, "cuda"> {byre.argname = "Output79", byre.argtype = 2 : i32}, %arg203: memref<512xf32, "cuda"> {byre.argname = "Output80", byre.argtype = 2 : i32}, %arg204: memref<64x3x7x7xf16, "cuda"> {byre.argname = "Output81", byre.argtype = 2 : i32}, %arg205: memref<1x3x224x224xf16, "cuda"> {byre.argname = "Output82", byre.argtype = 2 : i32}, %arg206: memref<1x64x112x112xf16, "cuda"> {byre.argname = "Output83", byre.argtype = 2 : i32}, %arg207: memref<1x64x112x112xf16, "cuda"> {byre.argname = "Output84", byre.argtype = 2 : i32}, %arg208: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output85", byre.argtype = 2 : i32}, %arg209: memref<64x64x3x3xf16, "cuda"> {byre.argname = "Output86", byre.argtype = 2 : i32}, %arg210: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output87", byre.argtype = 2 : i32}, %arg211: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output88", byre.argtype = 2 : i32}, %arg212: memref<64x64x3x3xf16, "cuda"> {byre.argname = "Output89", byre.argtype = 2 : i32}, %arg213: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output90", byre.argtype = 2 : i32}, %arg214: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output91", byre.argtype = 2 : i32}, %arg215: memref<64x64x3x3xf16, "cuda"> {byre.argname = "Output92", byre.argtype = 2 : i32}, %arg216: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output93", byre.argtype = 2 : i32}, %arg217: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output94", byre.argtype = 2 : i32}, %arg218: memref<64x64x3x3xf16, "cuda"> {byre.argname = "Output95", byre.argtype = 2 : i32}, %arg219: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output96", byre.argtype = 2 : i32}, %arg220: memref<1x64x56x56xf16, "cuda"> {byre.argname = "Output97", byre.argtype = 2 : i32}, %arg221: memref<128x64x3x3xf16, "cuda"> {byre.argname = "Output98", byre.argtype = 2 : i32}, %arg222: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output99", byre.argtype = 2 : i32}, %arg223: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output100", byre.argtype = 2 : i32}, %arg224: memref<128x128x3x3xf16, "cuda"> {byre.argname = "Output101", byre.argtype = 2 : i32}, %arg225: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output102", byre.argtype = 2 : i32}, %arg226: memref<128x64x1x1xf16, "cuda"> {byre.argname = "Output103", byre.argtype = 2 : i32}, %arg227: memref<1x128x28x28xf16, "cuda"> {byre.argname = 
"Output104", byre.argtype = 2 : i32}, %arg228: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output105", byre.argtype = 2 : i32}, %arg229: memref<128x128x3x3xf16, "cuda"> {byre.argname = "Output106", byre.argtype = 2 : i32}, %arg230: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output107", byre.argtype = 2 : i32}, %arg231: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output108", byre.argtype = 2 : i32}, %arg232: memref<128x128x3x3xf16, "cuda"> {byre.argname = "Output109", byre.argtype = 2 : i32}, %arg233: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output110", byre.argtype = 2 : i32}, %arg234: memref<1x128x28x28xf16, "cuda"> {byre.argname = "Output111", byre.argtype = 2 : i32}, %arg235: memref<256x128x3x3xf16, "cuda"> {byre.argname = "Output112", byre.argtype = 2 : i32}, %arg236: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output113", byre.argtype = 2 : i32}, %arg237: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output114", byre.argtype = 2 : i32}, %arg238: memref<256x256x3x3xf16, "cuda"> {byre.argname = "Output115", byre.argtype = 2 : i32}, %arg239: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output116", byre.argtype = 2 : i32}, %arg240: memref<256x128x1x1xf16, "cuda"> {byre.argname = "Output117", byre.argtype = 2 : i32}, %arg241: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output118", byre.argtype = 2 : i32}, %arg242: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output119", byre.argtype = 2 : i32}, %arg243: memref<256x256x3x3xf16, "cuda"> {byre.argname = "Output120", byre.argtype = 2 : i32}, %arg244: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output121", byre.argtype = 2 : i32}, %arg245: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output122", byre.argtype = 2 : i32}, %arg246: memref<256x256x3x3xf16, "cuda"> {byre.argname = "Output123", byre.argtype = 2 : i32}, %arg247: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output124", byre.argtype = 2 : i32}, %arg248: memref<1x256x14x14xf16, "cuda"> {byre.argname = "Output125", byre.argtype = 2 : i32}, %arg249: memref<512x256x3x3xf16, "cuda"> {byre.argname = "Output126", byre.argtype = 2 : i32}, %arg250: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output127", byre.argtype = 2 : i32}, %arg251: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output128", byre.argtype = 2 : i32}, %arg252: memref<512x512x3x3xf16, "cuda"> {byre.argname = "Output129", byre.argtype = 2 : i32}, %arg253: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output130", byre.argtype = 2 : i32}, %arg254: memref<512x256x1x1xf16, "cuda"> {byre.argname = "Output131", byre.argtype = 2 : i32}, %arg255: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output132", byre.argtype = 2 : i32}, %arg256: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output133", byre.argtype = 2 : i32}, %arg257: memref<512x512x3x3xf16, "cuda"> {byre.argname = "Output134", byre.argtype = 2 : i32}, %arg258: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output135", byre.argtype = 2 : i32}, %arg259: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output136", byre.argtype = 2 : i32}, %arg260: memref<512x512x3x3xf16, "cuda"> {byre.argname = "Output137", byre.argtype = 2 : i32}, %arg261: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output138", byre.argtype = 2 : i32}, %arg262: memref<1x512x7x7xf16, "cuda"> {byre.argname = "Output139", byre.argtype = 2 : i32}, %arg263: memref<1x512xf16, "cuda"> {byre.argname = "Output140", byre.argtype = 2 : i32}, %arg264: memref<512x1000xf16, "cuda"> {byre.argname = "Output141", byre.argtype = 2 : i32}) attributes 
{byre.entry_point, device_file_name = "your_file"} { %alloc = memref.alloc() : memref<1838592xi8, "cuda"> - byre.compute @PTXOp(%arg122, %arg205) {BlockSize.x = 128 : i32, GridSize.x = 1176 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown0", memory_effects = [1 : i32, 2 : i32]} : memref<1x3x224x224xf32, "cuda">, memref<1x3x224x224xf16, "cuda"> - byre.compute @PTXOp(%arg2, %arg204) {BlockSize.x = 128 : i32, GridSize.x = 74 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown1", memory_effects = [1 : i32, 2 : i32]} : memref<64x3x7x7xf32, "cuda">, memref<64x3x7x7xf16, "cuda"> + byre.compute @PTXOp(%arg122, %arg205) {BlockSize.x = 256 : i32, GridSize.x = 147 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown0", memory_effects = [1 : i32, 2 : i32]} : memref<1x3x224x224xf32, "cuda">, memref<1x3x224x224xf16, "cuda"> + byre.compute @PTXOp(%arg2, %arg204) {BlockSize.x = 256 : i32, GridSize.x = 10 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown1", memory_effects = [1 : i32, 2 : i32]} : memref<64x3x7x7xf32, "cuda">, memref<64x3x7x7xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg205, %arg204, %arg206) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x3x224x224xf16, "cuda">, memref<64x3x7x7xf16, "cuda">, memref<1x64x112x112xf16, "cuda"> - %0 = "byre.alias"(%alloc) {device = "cuda", offset = 224768 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x64x112x112xf16, "cuda"> - %1 = "byre.alias"(%alloc) {device = "cuda", offset = 7424 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> - %2 = "byre.alias"(%alloc) {device = "cuda", offset = 7168 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %0 = "byre.alias"(%alloc) <{offset = 224768 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<1x64x112x112xf16, "cuda"> + %1 = "byre.alias"(%alloc) <{offset = 7424 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %2 = "byre.alias"(%alloc) <{offset = 7168 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg206, %arg1, %arg0, %0, %1, %2) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<1x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%0, %arg207) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown3", memory_effects = [1 : i32, 2 : i32]} : memref<1x64x112x112xf16, "cuda">, memref<1x64x112x112xf16, "cuda"> + byre.compute @PTXOp(%0, %arg207) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown3", memory_effects = [1 : i32, 2 : i32]} : memref<1x64x112x112xf16, "cuda">, memref<1x64x112x112xf16, "cuda"> byre.compute @PoolMaxOp_f16_f16(%arg207, %arg208) {base_dilations = dense<1> : tensor<4xi64>, device = "cuda", memory_effects = [1 : i32, 2 : i32], padding = 
dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = dense<1> : tensor<4xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<1x64x112x112xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - byre.compute @PTXOp(%arg9, %arg209) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown4", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg9, %arg209) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown4", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg208, %arg209, %arg210) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %3 = "byre.alias"(%alloc) {device = "cuda", offset = 224768 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> - %4 = "byre.alias"(%alloc) {device = "cuda", offset = 6912 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> - %5 = "byre.alias"(%alloc) {device = "cuda", offset = 6656 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %3 = "byre.alias"(%alloc) <{offset = 224768 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<1x64x56x56xf16, "cuda"> + %4 = "byre.alias"(%alloc) <{offset = 6912 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %5 = "byre.alias"(%alloc) <{offset = 6656 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg210, %arg6, %arg5, %3, %4, %5) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%3, %arg211) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown6", memory_effects = [1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - byre.compute @PTXOp(%arg10, %arg212) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown7", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%3, %arg211) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown6", memory_effects = [1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + byre.compute @PTXOp(%arg10, %arg212) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown4", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> byre.compute 
@ConvOp_f16f16_f16(%arg211, %arg212, %arg213) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %6 = "byre.alias"(%alloc) {device = "cuda", offset = 6400 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> - %7 = "byre.alias"(%alloc) {device = "cuda", offset = 6144 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %6 = "byre.alias"(%alloc) <{offset = 6400 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %7 = "byre.alias"(%alloc) <{offset = 6144 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg213, %arg8, %arg7, %3, %6, %7) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%3, %arg208, %arg214) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown9", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - byre.compute @PTXOp(%arg15, %arg215) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown10", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%3, %arg208, %arg214) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown9", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + byre.compute @PTXOp(%arg15, %arg215) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown4", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg214, %arg215, %arg216) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %8 = "byre.alias"(%alloc) {device = "cuda", offset = 5888 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> - %9 = "byre.alias"(%alloc) {device = "cuda", offset = 5632 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %8 = "byre.alias"(%alloc) <{offset = 5888 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %9 = "byre.alias"(%alloc) <{offset = 5632 : i64}> {device = "cuda"} : (memref<1838592xi8, 
"cuda">) -> memref<64xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg216, %arg12, %arg11, %3, %8, %9) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%3, %arg217) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown12", memory_effects = [1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - byre.compute @PTXOp(%arg16, %arg218) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown13", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%3, %arg217) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown6", memory_effects = [1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + byre.compute @PTXOp(%arg16, %arg218) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown4", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg217, %arg218, %arg219) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - %10 = "byre.alias"(%alloc) {device = "cuda", offset = 5376 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> - %11 = "byre.alias"(%alloc) {device = "cuda", offset = 0 : i64} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %10 = "byre.alias"(%alloc) <{offset = 5376 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> + %11 = "byre.alias"(%alloc) <{offset = 0 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<64xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg219, %arg14, %arg13, %3, %10, %11) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%3, %arg214, %arg220) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown15", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> - byre.compute @PTXOp(%arg23, %arg226) {BlockSize.x = 128 : i32, GridSize.x = 64 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown16", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x1x1xf32, "cuda">, memref<128x64x1x1xf16, "cuda"> + byre.compute @PTXOp(%3, %arg214, %arg220) {BlockSize.x = 256 
: i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown9", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda">, memref<1x64x56x56xf16, "cuda"> + byre.compute @PTXOp(%arg23, %arg226) {BlockSize.x = 256 : i32, GridSize.x = 8 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown16", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x1x1xf32, "cuda">, memref<128x64x1x1xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg220, %arg226, %arg227) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<128x64x1x1xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %12 = "byre.alias"(%alloc) {device = "cuda", offset = 8704 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> - %13 = "byre.alias"(%alloc) {device = "cuda", offset = 256 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> - %14 = "byre.alias"(%alloc) {device = "cuda", offset = 768 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %12 = "byre.alias"(%alloc) <{offset = 8704 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + %13 = "byre.alias"(%alloc) <{offset = 256 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %14 = "byre.alias"(%alloc) <{offset = 768 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg227, %arg25, %arg24, %12, %13, %14) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%arg21, %arg221) {BlockSize.x = 128 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown18", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x3x3xf32, "cuda">, memref<128x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg21, %arg221) {BlockSize.x = 256 : i32, GridSize.x = 72 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown18", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x3x3xf32, "cuda">, memref<128x64x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg220, %arg221, %arg222) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x64x56x56xf16, "cuda">, memref<128x64x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %15 = "byre.alias"(%alloc) {device = "cuda", offset = 224768 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> - %16 = "byre.alias"(%alloc) {device = "cuda", offset = 4864 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> - %17 = 
"byre.alias"(%alloc) {device = "cuda", offset = 1280 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %15 = "byre.alias"(%alloc) <{offset = 224768 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<1x128x28x28xf16, "cuda"> + %16 = "byre.alias"(%alloc) <{offset = 4864 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %17 = "byre.alias"(%alloc) <{offset = 1280 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg222, %arg18, %arg17, %15, %16, %17) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%15, %arg223) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown20", memory_effects = [1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - byre.compute @PTXOp(%arg22, %arg224) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown21", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%15, %arg223) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown20", memory_effects = [1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%arg22, %arg224) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown21", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg223, %arg224, %arg225) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %18 = "byre.alias"(%alloc) {device = "cuda", offset = 1792 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> - %19 = "byre.alias"(%alloc) {device = "cuda", offset = 2304 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %18 = "byre.alias"(%alloc) <{offset = 1792 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %19 = "byre.alias"(%alloc) <{offset = 2304 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg225, %arg20, %arg19, %15, %18, %19) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%15, %12, %arg228) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : 
i32], device = "cuda", kernel_name = "Unknown23", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - byre.compute @PTXOp(%arg30, %arg229) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown24", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%15, %12, %arg228) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown23", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%arg30, %arg229) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown21", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg228, %arg229, %arg230) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %20 = "byre.alias"(%alloc) {device = "cuda", offset = 2816 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> - %21 = "byre.alias"(%alloc) {device = "cuda", offset = 3328 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %20 = "byre.alias"(%alloc) <{offset = 2816 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %21 = "byre.alias"(%alloc) <{offset = 3328 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg230, %arg27, %arg26, %15, %20, %21) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%15, %arg231) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown26", memory_effects = [1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - byre.compute @PTXOp(%arg31, %arg232) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown27", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%15, %arg231) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown20", memory_effects = [1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%arg31, %arg232) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown21", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, 
"cuda"> byre.compute @ConvOp_f16f16_f16(%arg231, %arg232, %arg233) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - %22 = "byre.alias"(%alloc) {device = "cuda", offset = 3840 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> - %23 = "byre.alias"(%alloc) {device = "cuda", offset = 4352 : i64} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %22 = "byre.alias"(%alloc) <{offset = 3840 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> + %23 = "byre.alias"(%alloc) <{offset = 4352 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<128xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg233, %arg29, %arg28, %15, %22, %23) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%15, %arg228, %arg234) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown29", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> - byre.compute @PTXOp(%arg38, %arg240) {BlockSize.x = 128 : i32, GridSize.x = 256 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown30", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x1x1xf32, "cuda">, memref<256x128x1x1xf16, "cuda"> + byre.compute @PTXOp(%15, %arg228, %arg234) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown23", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda">, memref<1x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%arg38, %arg240) {BlockSize.x = 256 : i32, GridSize.x = 32 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown30", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x1x1xf32, "cuda">, memref<256x128x1x1xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg234, %arg240, %arg241) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<256x128x1x1xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %24 = "byre.alias"(%alloc) {device = "cuda", offset = 224768 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - %25 = "byre.alias"(%alloc) {device = "cuda", offset = 223744 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> - %26 = "byre.alias"(%alloc) {device = "cuda", offset = 1836544 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %24 = 
"byre.alias"(%alloc) <{offset = 224768 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + %25 = "byre.alias"(%alloc) <{offset = 223744 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %26 = "byre.alias"(%alloc) <{offset = 1836544 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg241, %arg40, %arg39, %24, %25, %26) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%arg36, %arg235) {BlockSize.x = 128 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown32", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x3x3xf32, "cuda">, memref<256x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg36, %arg235) {BlockSize.x = 256 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown32", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x3x3xf32, "cuda">, memref<256x128x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg234, %arg235, %arg236) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x128x28x28xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %27 = "byre.alias"(%alloc) {device = "cuda", offset = 325120 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> - %28 = "byre.alias"(%alloc) {device = "cuda", offset = 1835520 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> - %29 = "byre.alias"(%alloc) {device = "cuda", offset = 1834496 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %27 = "byre.alias"(%alloc) <{offset = 325120 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<1x256x14x14xf16, "cuda"> + %28 = "byre.alias"(%alloc) <{offset = 1835520 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %29 = "byre.alias"(%alloc) <{offset = 1834496 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg236, %arg33, %arg32, %27, %28, %29) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%27, %arg237) {BlockSize.x = 128 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown34", memory_effects = [1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - byre.compute @PTXOp(%arg37, %arg238) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown35", memory_effects = [1 : i32, 2 : i32]} : 
memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%27, %arg237) {BlockSize.x = 256 : i32, GridSize.x = 49 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown34", memory_effects = [1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + byre.compute @PTXOp(%arg37, %arg238) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown35", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg237, %arg238, %arg239) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %30 = "byre.alias"(%alloc) {device = "cuda", offset = 1833472 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> - %31 = "byre.alias"(%alloc) {device = "cuda", offset = 1837568 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %30 = "byre.alias"(%alloc) <{offset = 1833472 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %31 = "byre.alias"(%alloc) <{offset = 1837568 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg239, %arg35, %arg34, %27, %30, %31) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%27, %24, %arg242) {BlockSize.x = 128 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown37", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - byre.compute @PTXOp(%arg45, %arg243) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown38", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%27, %24, %arg242) {BlockSize.x = 256 : i32, GridSize.x = 49 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown37", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + byre.compute @PTXOp(%arg45, %arg243) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown35", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg242, %arg243, %arg244) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : 
tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %32 = "byre.alias"(%alloc) {device = "cuda", offset = 1832448 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> - %33 = "byre.alias"(%alloc) {device = "cuda", offset = 1831424 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %32 = "byre.alias"(%alloc) <{offset = 1832448 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %33 = "byre.alias"(%alloc) <{offset = 1831424 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg244, %arg42, %arg41, %24, %32, %33) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%24, %arg245) {BlockSize.x = 128 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown40", memory_effects = [1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - byre.compute @PTXOp(%arg46, %arg246) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown41", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%24, %arg245) {BlockSize.x = 256 : i32, GridSize.x = 49 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown34", memory_effects = [1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + byre.compute @PTXOp(%arg46, %arg246) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown35", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg245, %arg246, %arg247) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - %34 = "byre.alias"(%alloc) {device = "cuda", offset = 1830400 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> - %35 = "byre.alias"(%alloc) {device = "cuda", offset = 7680 : i64} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %34 = "byre.alias"(%alloc) <{offset = 1830400 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> + %35 = "byre.alias"(%alloc) <{offset = 7680 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<256xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg247, %arg44, %arg43, %24, %34, %35) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, 
"cuda">, memref<1x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%24, %arg242, %arg248) {BlockSize.x = 128 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown43", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> - byre.compute @PTXOp(%arg53, %arg254) {BlockSize.x = 128 : i32, GridSize.x = 1024 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown44", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x1x1xf32, "cuda">, memref<512x256x1x1xf16, "cuda"> + byre.compute @PTXOp(%24, %arg242, %arg248) {BlockSize.x = 256 : i32, GridSize.x = 49 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown37", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda">, memref<1x256x14x14xf16, "cuda"> + byre.compute @PTXOp(%arg53, %arg254) {BlockSize.x = 256 : i32, GridSize.x = 128 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown44", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x1x1xf32, "cuda">, memref<512x256x1x1xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg248, %arg254, %arg255) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %36 = "byre.alias"(%alloc) {device = "cuda", offset = 224768 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> - %37 = "byre.alias"(%alloc) {device = "cuda", offset = 8704 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> - %38 = "byre.alias"(%alloc) {device = "cuda", offset = 209408 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %36 = "byre.alias"(%alloc) <{offset = 224768 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + %37 = "byre.alias"(%alloc) <{offset = 8704 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %38 = "byre.alias"(%alloc) <{offset = 209408 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg255, %arg55, %arg54, %36, %37, %38) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%arg51, %arg249) {BlockSize.x = 128 : i32, GridSize.x = 9216 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown46", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x3x3xf32, "cuda">, memref<512x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg51, %arg249) {BlockSize.x = 256 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown46", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x3x3xf32, "cuda">, memref<512x256x3x3xf16, "cuda"> byre.compute 
@ConvOp_f16f16_f16(%arg248, %arg249, %arg250) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<1x256x14x14xf16, "cuda">, memref<512x256x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %39 = "byre.alias"(%alloc) {device = "cuda", offset = 274944 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> - %40 = "byre.alias"(%alloc) {device = "cuda", offset = 12800 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> - %41 = "byre.alias"(%alloc) {device = "cuda", offset = 10752 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %39 = "byre.alias"(%alloc) <{offset = 274944 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<1x512x7x7xf16, "cuda"> + %40 = "byre.alias"(%alloc) <{offset = 12800 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %41 = "byre.alias"(%alloc) <{offset = 10752 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg250, %arg48, %arg47, %39, %40, %41) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%39, %arg251) {BlockSize.x = 128 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown48", memory_effects = [1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - byre.compute @PTXOp(%arg52, %arg252) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown49", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%39, %arg251) {BlockSize.x = 256 : i32, GridSize.x = 25 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown48", memory_effects = [1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> + byre.compute @PTXOp(%arg52, %arg252) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown49", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg251, %arg252, %arg253) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %42 = "byre.alias"(%alloc) {device = "cuda", offset = 211456 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> - %43 = "byre.alias"(%alloc) {device = "cuda", offset = 213504 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %42 = "byre.alias"(%alloc) 
<{offset = 211456 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %43 = "byre.alias"(%alloc) <{offset = 213504 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg253, %arg50, %arg49, %39, %42, %43) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%39, %36, %arg256) {BlockSize.x = 128 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown51", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - byre.compute @PTXOp(%arg60, %arg257) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown52", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%39, %36, %arg256) {BlockSize.x = 256 : i32, GridSize.x = 25 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown51", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> + byre.compute @PTXOp(%arg60, %arg257) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown49", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg256, %arg257, %arg258) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %44 = "byre.alias"(%alloc) {device = "cuda", offset = 215552 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> - %45 = "byre.alias"(%alloc) {device = "cuda", offset = 217600 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %44 = "byre.alias"(%alloc) <{offset = 215552 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %45 = "byre.alias"(%alloc) <{offset = 217600 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg258, %arg57, %arg56, %36, %44, %45) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%36, %arg259) {BlockSize.x = 128 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown54", memory_effects = [1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - byre.compute @PTXOp(%arg61, %arg260) 
{BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown55", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%36, %arg259) {BlockSize.x = 256 : i32, GridSize.x = 25 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown48", memory_effects = [1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> + byre.compute @PTXOp(%arg61, %arg260) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown49", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%arg259, %arg260, %arg261) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<1x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %46 = "byre.alias"(%alloc) {device = "cuda", offset = 219648 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> - %47 = "byre.alias"(%alloc) {device = "cuda", offset = 221696 : i64} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %46 = "byre.alias"(%alloc) <{offset = 219648 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> + %47 = "byre.alias"(%alloc) <{offset = 221696 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<512xf32, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16f32f32(%arg261, %arg59, %arg58, %36, %46, %47) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%36, %arg256, %arg262) {BlockSize.x = 128 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown57", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> - %48 = "byre.alias"(%alloc) {device = "cuda", offset = 224768 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x512xf16, "cuda"> - byre.compute @ReduceSumOp_f16_f16(%arg262, %48) {device = "cuda", dimensions = dense<[3, 2]> : tensor<2xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512xf16, "cuda"> - byre.compute @PTXOp(%48, %arg263) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown58", memory_effects = [1 : i32, 2 : i32]} : memref<1x512xf16, "cuda">, memref<1x512xf16, "cuda"> - %49 = "byre.alias"(%alloc) {device = "cuda", offset = 224768 : i64} : (memref<1838592xi8, "cuda">) -> memref<1000x512xf16, "cuda"> - byre.compute @PTXOp(%arg4, %49) {BlockSize.x = 128 : i32, GridSize.x = 4000 : i32, arg_ranks = [2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown59", memory_effects = [1 : i32, 2 : i32]} : memref<1000x512xf32, "cuda">, memref<1000x512xf16, "cuda"> - byre.compute @TransposeOp_f16_f16(%49, %arg264) 
{device = "cuda", memory_effects = [1 : i32, 2 : i32], minor_to_major = dense<[0, 1]> : tensor<2xindex>, permutation = dense<[1, 0]> : tensor<2xi64>} : memref<1000x512xf16, "cuda">, memref<512x1000xf16, "cuda"> - %50 = "byre.alias"(%alloc) {device = "cuda", offset = 14848 : i64} : (memref<1838592xi8, "cuda">) -> memref<1x1000xf16, "cuda"> - byre.compute @MatmulOp_f16f16_f16(%arg263, %49, %50) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<1x512xf16, "cuda">, memref<1000x512xf16, "cuda">, memref<1x1000xf16, "cuda"> - byre.compute @PTXOp(%arg3, %50, %arg123) {BlockSize.x = 128 : i32, GridSize.x = 8 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown60", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1000xf32, "cuda">, memref<1x1000xf16, "cuda">, memref<1x1000xf16, "cuda"> - byre.compute @PTXOp(%1, %arg63, %arg164) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown61", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%2, %arg64, %arg165) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%4, %arg66, %arg166) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown63", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%5, %arg67, %arg167) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown64", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%6, %arg69, %arg168) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown65", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%7, %arg70, %arg169) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown66", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%8, %arg72, %arg170) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown67", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%9, %arg73, %arg171) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown68", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%10, %arg75, %arg172) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown69", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, 
"cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%11, %arg76, %arg173) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown70", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @PTXOp(%16, %arg78, %arg174) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown71", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%17, %arg79, %arg175) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%18, %arg81, %arg176) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown73", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%19, %arg82, %arg177) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown74", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%13, %arg84, %arg178) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown75", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%14, %arg85, %arg179) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown76", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%20, %arg87, %arg180) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown77", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%21, %arg88, %arg181) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown78", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%22, %arg90, %arg182) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown79", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%23, %arg91, %arg183) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown80", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @PTXOp(%28, %arg93, %arg184) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown81", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : 
memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%29, %arg94, %arg185) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%30, %arg96, %arg186) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown83", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%31, %arg97, %arg187) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown84", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%25, %arg99, %arg188) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown85", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%26, %arg100, %arg189) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown86", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%32, %arg102, %arg190) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown87", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%33, %arg103, %arg191) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown88", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%34, %arg105, %arg192) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown89", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%35, %arg106, %arg193) {BlockSize.x = 128 : i32, GridSize.x = 2 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown90", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @PTXOp(%40, %arg108, %arg194) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown91", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%41, %arg109, %arg195) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%42, %arg111, %arg196) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown93", 
memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%43, %arg112, %arg197) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown94", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%37, %arg114, %arg198) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown95", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%38, %arg115, %arg199) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown96", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%44, %arg117, %arg200) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown97", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%45, %arg118, %arg201) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown98", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%46, %arg120, %arg202) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown99", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @PTXOp(%47, %arg121, %arg203) {BlockSize.x = 128 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown100", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%36, %arg256, %arg262) {BlockSize.x = 256 : i32, GridSize.x = 25 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown51", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda">, memref<1x512x7x7xf16, "cuda"> + %48 = "byre.alias"(%arg262) <{offset = 0 : i64}> {device = "cuda"} : (memref<1x512x7x7xf16, "cuda">) -> memref<512x49xf16, "cuda"> + %49 = "byre.alias"(%alloc) <{offset = 224768 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<512xf16, "cuda"> + byre.compute @PTXOp(%48, %49) {BlockSize.x = 64 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 512 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown58_kernel"} : memref<512x49xf16, "cuda">, memref<512xf16, "cuda"> + %50 = "byre.alias"(%alloc) <{offset = 224768 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<1x512xf16, "cuda"> + byre.compute @PTXOp(%50, %arg263) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown59", memory_effects = [1 : i32, 2 : i32]} : memref<1x512xf16, "cuda">, memref<1x512xf16, "cuda"> + %51 = "byre.alias"(%alloc) <{offset = 224768 : i64}> {device = "cuda"} : 
(memref<1838592xi8, "cuda">) -> memref<1000x512xf16, "cuda"> + byre.compute @PTXOp(%arg4, %51) {BlockSize.x = 256 : i32, GridSize.x = 500 : i32, arg_ranks = [2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown60", memory_effects = [1 : i32, 2 : i32]} : memref<1000x512xf32, "cuda">, memref<1000x512xf16, "cuda"> + byre.compute @TransposeOp_f16_f16(%51, %arg264) {device = "cuda", memory_effects = [1 : i32, 2 : i32], minor_to_major = dense<[0, 1]> : tensor<2xindex>, permutation = dense<[1, 0]> : tensor<2xi64>} : memref<1000x512xf16, "cuda">, memref<512x1000xf16, "cuda"> + %52 = "byre.alias"(%alloc) <{offset = 14848 : i64}> {device = "cuda"} : (memref<1838592xi8, "cuda">) -> memref<1x1000xf16, "cuda"> + byre.compute @MatmulOp_f16f16_f16(%arg263, %51, %52) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<1x512xf16, "cuda">, memref<1000x512xf16, "cuda">, memref<1x1000xf16, "cuda"> + byre.compute @PTXOp(%arg3, %52, %arg123) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown61", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1000xf32, "cuda">, memref<1x1000xf16, "cuda">, memref<1x1000xf16, "cuda"> + byre.compute @PTXOp(%1, %arg63, %arg164) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%2, %arg64, %arg165) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%4, %arg66, %arg166) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%5, %arg67, %arg167) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%6, %arg69, %arg168) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%7, %arg70, %arg169) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%8, %arg72, %arg170) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%9, %arg73, %arg171) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown62", memory_effects = 
[1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%10, %arg75, %arg172) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%11, %arg76, %arg173) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @PTXOp(%16, %arg78, %arg174) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%17, %arg79, %arg175) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%18, %arg81, %arg176) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%19, %arg82, %arg177) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%13, %arg84, %arg178) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%14, %arg85, %arg179) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%20, %arg87, %arg180) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%21, %arg88, %arg181) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%22, %arg90, %arg182) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%23, %arg91, %arg183) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = 
"Unknown72", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @PTXOp(%28, %arg93, %arg184) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%29, %arg94, %arg185) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%30, %arg96, %arg186) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%31, %arg97, %arg187) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%25, %arg99, %arg188) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%26, %arg100, %arg189) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%32, %arg102, %arg190) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%33, %arg103, %arg191) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%34, %arg105, %arg192) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%35, %arg106, %arg193) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown82", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @PTXOp(%40, %arg108, %arg194) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%41, %arg109, %arg195) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 
1 : i32], device = "cuda", kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%42, %arg111, %arg196) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%43, %arg112, %arg197) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%37, %arg114, %arg198) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%38, %arg115, %arg199) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%44, %arg117, %arg200) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%45, %arg118, %arg201) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%46, %arg120, %arg202) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @PTXOp(%47, %arg121, %arg203) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown92", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> byre.copy(%arg0, %arg124) {callee = "cuda2cuda", device = "cuda"} : memref<64xf32, "cuda">, memref<64xf32, "cuda"> byre.copy(%arg1, %arg125) {callee = "cuda2cuda", device = "cuda"} : memref<64xf32, "cuda">, memref<64xf32, "cuda"> byre.copy(%arg5, %arg126) {callee = "cuda2cuda", device = "cuda"} : memref<64xf32, "cuda">, memref<64xf32, "cuda"> diff --git a/compiler/test/E2E/ResNet18/Whole/2_linalg_tensor_opt.mlir b/compiler/test/E2E/ResNet18/Whole/2_linalg_tensor_opt.mlir index 8d4fa513d..53f87a4e0 100644 --- a/compiler/test/E2E/ResNet18/Whole/2_linalg_tensor_opt.mlir +++ b/compiler/test/E2E/ResNet18/Whole/2_linalg_tensor_opt.mlir @@ -21,18 +21,6 @@ module @IrToMhlo.2452 { %0 = mhlo.convert %arg0 : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> return %0 : tensor<64x64x3x3xf16> } - func.func private @Unknown4(%arg0: tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 : (tensor<64x64x3x3xf32>) -> 
tensor<64x64x3x3xf16> - return %0 : tensor<64x64x3x3xf16> - } - func.func private @Unknown5(%arg0: tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> - return %0 : tensor<64x64x3x3xf16> - } - func.func private @Unknown6(%arg0: tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> - return %0 : tensor<64x64x3x3xf16> - } func.func private @Unknown7(%arg0: tensor<128x64x1x1xf32>) -> tensor<128x64x1x1xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 : (tensor<128x64x1x1xf32>) -> tensor<128x64x1x1xf16> return %0 : tensor<128x64x1x1xf16> @@ -45,14 +33,6 @@ module @IrToMhlo.2452 { %0 = mhlo.convert %arg0 : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> return %0 : tensor<128x128x3x3xf16> } - func.func private @Unknown10(%arg0: tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> - return %0 : tensor<128x128x3x3xf16> - } - func.func private @Unknown11(%arg0: tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> - return %0 : tensor<128x128x3x3xf16> - } func.func private @Unknown12(%arg0: tensor<256x128x1x1xf32>) -> tensor<256x128x1x1xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 : (tensor<256x128x1x1xf32>) -> tensor<256x128x1x1xf16> return %0 : tensor<256x128x1x1xf16> @@ -65,14 +45,6 @@ module @IrToMhlo.2452 { %0 = mhlo.convert %arg0 : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> return %0 : tensor<256x256x3x3xf16> } - func.func private @Unknown15(%arg0: tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> - return %0 : tensor<256x256x3x3xf16> - } - func.func private @Unknown16(%arg0: tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> - return %0 : tensor<256x256x3x3xf16> - } func.func private @Unknown17(%arg0: tensor<512x256x1x1xf32>) -> tensor<512x256x1x1xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 : (tensor<512x256x1x1xf32>) -> tensor<512x256x1x1xf16> return %0 : tensor<512x256x1x1xf16> @@ -85,14 +57,6 @@ module @IrToMhlo.2452 { %0 = mhlo.convert %arg0 : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> return %0 : tensor<512x512x3x3xf16> } - func.func private @Unknown20(%arg0: tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> - return %0 : tensor<512x512x3x3xf16> - } - func.func private @Unknown21(%arg0: tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> - return %0 : tensor<512x512x3x3xf16> - } func.func private @Unknown22(%arg0: tensor<4x1000xf32>) -> tensor<4x1000xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<-2.500000e-01> : tensor<4x1000xf32> %1 = mhlo.multiply %arg0, %0 : 
tensor<4x1000xf32> @@ -103,261 +67,158 @@ module @IrToMhlo.2452 { %0 = mhlo.convert %arg0 : (tensor<1000x512xf32>) -> tensor<1000x512xf16> return %0 : tensor<1000x512xf16> } - func.func private @Unknown24(%arg0: tensor<4x64x112x112xf16>) -> (tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown24(%arg0: tensor<1000xf32>) -> tensor<1000xf16> attributes {__byteir_elementwise_fusion__} { + %0 = mhlo.convert %arg0 : (tensor<1000xf32>) -> tensor<1000xf16> + return %0 : tensor<1000xf16> + } + func.func private @Unknown25(%arg0: tensor<4x1000xf16>) -> tensor<4xf16> attributes {__byteir_reduction_fusion__} { + %0 = mhlo.constant dense<0.000000e+00> : tensor + %1 = mhlo.reduce(%arg0 init: %0) across dimensions = [1] : (tensor<4x1000xf16>, tensor) -> tensor<4xf16> + reducer(%arg1: tensor, %arg2: tensor) { + %2 = mhlo.add %arg1, %arg2 : tensor + mhlo.return %2 : tensor + } + return %1 : tensor<4xf16> + } + func.func private @Unknown26(%arg0: tensor<4x64x112x112xf16>) -> (tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1>) attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.000000e+00> : tensor<4x64x112x112xf16> %1 = mhlo.maximum %arg0, %0 : tensor<4x64x112x112xf16> %2 = mhlo.compare GT, %1, %0 : (tensor<4x64x112x112xf16>, tensor<4x64x112x112xf16>) -> tensor<4x64x112x112xi1> return %1, %2 : tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1> } - func.func private @BatchNormTrainingOp25(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<64xf32>) -> tensor<4x64x56x56xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) -> (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) - %1 = mhlo.convert %output : (tensor<4x64x56x56xf32>) -> tensor<4x64x56x56xf16> - return %1 : tensor<4x64x56x56xf16> - } - func.func private @Unknown26(%arg0: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<4x64x56x56xf16> - %1 = mhlo.maximum %arg0, %0 : tensor<4x64x56x56xf16> - %2 = mhlo.compare GT, %1, %0 : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xi1> - return %1, %2 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> - } func.func private @BatchNormTrainingOp27(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<64xf32>) -> tensor<4x64x56x56xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { %0 = mhlo.convert %arg0 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) -> (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) %1 = mhlo.convert %output : (tensor<4x64x56x56xf32>) -> tensor<4x64x56x56xf16> return %1 : tensor<4x64x56x56xf16> } - func.func private @Unknown28(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { - %0 = 
mhlo.constant dense<0.000000e+00> : tensor<4x64x56x56xf16> - %1 = mhlo.add %arg0, %arg1 : tensor<4x64x56x56xf16> - %2 = mhlo.maximum %1, %0 : tensor<4x64x56x56xf16> - %3 = mhlo.compare GT, %2, %0 : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xi1> - return %2, %3 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> - } - func.func private @BatchNormTrainingOp29(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<64xf32>) -> tensor<4x64x56x56xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) -> (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) - %1 = mhlo.convert %output : (tensor<4x64x56x56xf32>) -> tensor<4x64x56x56xf16> - return %1 : tensor<4x64x56x56xf16> - } - func.func private @Unknown30(%arg0: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown28(%arg0: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.000000e+00> : tensor<4x64x56x56xf16> %1 = mhlo.maximum %arg0, %0 : tensor<4x64x56x56xf16> %2 = mhlo.compare GT, %1, %0 : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xi1> return %1, %2 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> } - func.func private @BatchNormTrainingOp31(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<64xf32>) -> tensor<4x64x56x56xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) -> (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) - %1 = mhlo.convert %output : (tensor<4x64x56x56xf32>) -> tensor<4x64x56x56xf16> - return %1 : tensor<4x64x56x56xf16> - } - func.func private @Unknown32(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown30(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.000000e+00> : tensor<4x64x56x56xf16> %1 = mhlo.add %arg0, %arg1 : tensor<4x64x56x56xf16> %2 = mhlo.maximum %1, %0 : tensor<4x64x56x56xf16> %3 = mhlo.compare GT, %2, %0 : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xi1> return %2, %3 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> } - func.func private @BatchNormTrainingOp33(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> tensor<4x128x28x28xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, 
%arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) -> (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %1 = mhlo.convert %output : (tensor<4x128x28x28xf32>) -> tensor<4x128x28x28xf16> - return %1 : tensor<4x128x28x28xf16> - } - func.func private @BatchNormTrainingOp34(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> tensor<4x128x28x28xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { + func.func private @BatchNormTrainingOp35(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> tensor<4x128x28x28xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { %0 = mhlo.convert %arg0 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) -> (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) %1 = mhlo.convert %output : (tensor<4x128x28x28xf32>) -> tensor<4x128x28x28xf16> return %1 : tensor<4x128x28x28xf16> } - func.func private @Unknown35(%arg0: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown37(%arg0: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.000000e+00> : tensor<4x128x28x28xf16> %1 = mhlo.maximum %arg0, %0 : tensor<4x128x28x28xf16> %2 = mhlo.compare GT, %1, %0 : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xi1> return %1, %2 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> } - func.func private @BatchNormTrainingOp36(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> tensor<4x128x28x28xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) -> (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %1 = mhlo.convert %output : (tensor<4x128x28x28xf32>) -> tensor<4x128x28x28xf16> - return %1 : tensor<4x128x28x28xf16> - } - func.func private @Unknown37(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown39(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.000000e+00> : tensor<4x128x28x28xf16> %1 = mhlo.add %arg0, %arg1 : tensor<4x128x28x28xf16> %2 = mhlo.maximum %1, %0 : tensor<4x128x28x28xf16> %3 = mhlo.compare GT, %2, %0 : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xi1> return %2, %3 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> } - func.func private @BatchNormTrainingOp38(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) 
-> tensor<4x128x28x28xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) -> (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %1 = mhlo.convert %output : (tensor<4x128x28x28xf32>) -> tensor<4x128x28x28xf16> - return %1 : tensor<4x128x28x28xf16> - } - func.func private @Unknown39(%arg0: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<4x128x28x28xf16> - %1 = mhlo.maximum %arg0, %0 : tensor<4x128x28x28xf16> - %2 = mhlo.compare GT, %1, %0 : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xi1> - return %1, %2 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> - } - func.func private @BatchNormTrainingOp40(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> tensor<4x128x28x28xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) -> (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %1 = mhlo.convert %output : (tensor<4x128x28x28xf32>) -> tensor<4x128x28x28xf16> - return %1 : tensor<4x128x28x28xf16> - } - func.func private @Unknown41(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<4x128x28x28xf16> - %1 = mhlo.add %arg0, %arg1 : tensor<4x128x28x28xf16> - %2 = mhlo.maximum %1, %0 : tensor<4x128x28x28xf16> - %3 = mhlo.compare GT, %2, %0 : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xi1> - return %2, %3 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> - } - func.func private @BatchNormTrainingOp42(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<256xf32>) -> tensor<4x256x14x14xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { + func.func private @BatchNormTrainingOp44(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<256xf32>) -> tensor<4x256x14x14xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { %0 = mhlo.convert %arg0 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) -> (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) %1 = mhlo.convert %output : (tensor<4x256x14x14xf32>) -> tensor<4x256x14x14xf16> return %1 : tensor<4x256x14x14xf16> } - func.func private @BatchNormTrainingOp43(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<256xf32>) -> tensor<4x256x14x14xf16> 
attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) -> (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %1 = mhlo.convert %output : (tensor<4x256x14x14xf32>) -> tensor<4x256x14x14xf16> - return %1 : tensor<4x256x14x14xf16> - } - func.func private @Unknown44(%arg0: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown46(%arg0: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.000000e+00> : tensor<4x256x14x14xf16> %1 = mhlo.maximum %arg0, %0 : tensor<4x256x14x14xf16> %2 = mhlo.compare GT, %1, %0 : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xi1> return %1, %2 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> } - func.func private @BatchNormTrainingOp45(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<256xf32>) -> tensor<4x256x14x14xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) -> (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %1 = mhlo.convert %output : (tensor<4x256x14x14xf32>) -> tensor<4x256x14x14xf16> - return %1 : tensor<4x256x14x14xf16> - } - func.func private @Unknown46(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<4x256x14x14xf16> - %1 = mhlo.add %arg0, %arg1 : tensor<4x256x14x14xf16> - %2 = mhlo.maximum %1, %0 : tensor<4x256x14x14xf16> - %3 = mhlo.compare GT, %2, %0 : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xi1> - return %2, %3 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> - } - func.func private @BatchNormTrainingOp47(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<256xf32>) -> tensor<4x256x14x14xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) -> (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %1 = mhlo.convert %output : (tensor<4x256x14x14xf32>) -> tensor<4x256x14x14xf16> - return %1 : tensor<4x256x14x14xf16> - } - func.func private @Unknown48(%arg0: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<4x256x14x14xf16> - %1 = mhlo.maximum %arg0, %0 : tensor<4x256x14x14xf16> - %2 = 
mhlo.compare GT, %1, %0 : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xi1> - return %1, %2 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> - } - func.func private @BatchNormTrainingOp49(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<256xf32>) -> tensor<4x256x14x14xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) -> (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %1 = mhlo.convert %output : (tensor<4x256x14x14xf32>) -> tensor<4x256x14x14xf16> - return %1 : tensor<4x256x14x14xf16> - } - func.func private @Unknown50(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown48(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.000000e+00> : tensor<4x256x14x14xf16> %1 = mhlo.add %arg0, %arg1 : tensor<4x256x14x14xf16> %2 = mhlo.maximum %1, %0 : tensor<4x256x14x14xf16> %3 = mhlo.compare GT, %2, %0 : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xi1> return %2, %3 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> } - func.func private @BatchNormTrainingOp51(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<512xf32>) -> tensor<4x512x7x7xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { + func.func private @BatchNormTrainingOp53(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<512xf32>) -> tensor<4x512x7x7xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { %0 = mhlo.convert %arg0 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) -> (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) %1 = mhlo.convert %output : (tensor<4x512x7x7xf32>) -> tensor<4x512x7x7xf16> return %1 : tensor<4x512x7x7xf16> } - func.func private @BatchNormTrainingOp52(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<512xf32>) -> tensor<4x512x7x7xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) -> (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %1 = mhlo.convert %output : (tensor<4x512x7x7xf32>) -> tensor<4x512x7x7xf16> - return %1 : tensor<4x512x7x7xf16> - } - func.func private @Unknown53(%arg0: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private 
@Unknown55(%arg0: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.000000e+00> : tensor<4x512x7x7xf16> %1 = mhlo.maximum %arg0, %0 : tensor<4x512x7x7xf16> %2 = mhlo.compare GT, %1, %0 : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xi1> return %1, %2 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> } - func.func private @BatchNormTrainingOp54(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<512xf32>) -> tensor<4x512x7x7xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) -> (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %1 = mhlo.convert %output : (tensor<4x512x7x7xf32>) -> tensor<4x512x7x7xf16> - return %1 : tensor<4x512x7x7xf16> - } - func.func private @Unknown55(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown57(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.000000e+00> : tensor<4x512x7x7xf16> %1 = mhlo.add %arg0, %arg1 : tensor<4x512x7x7xf16> %2 = mhlo.maximum %1, %0 : tensor<4x512x7x7xf16> %3 = mhlo.compare GT, %2, %0 : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xi1> return %2, %3 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> } - func.func private @BatchNormTrainingOp56(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<512xf32>) -> tensor<4x512x7x7xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) -> (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %1 = mhlo.convert %output : (tensor<4x512x7x7xf32>) -> tensor<4x512x7x7xf16> - return %1 : tensor<4x512x7x7xf16> - } - func.func private @Unknown57(%arg0: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<4x512x7x7xf16> - %1 = mhlo.maximum %arg0, %0 : tensor<4x512x7x7xf16> - %2 = mhlo.compare GT, %1, %0 : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xi1> - return %1, %2 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> - } - func.func private @BatchNormTrainingOp58(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<512xf32>) -> tensor<4x512x7x7xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x512x7x7xf32>, tensor<512xf32>, 
tensor<512xf32>) -> (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %1 = mhlo.convert %output : (tensor<4x512x7x7xf32>) -> tensor<4x512x7x7xf16> - return %1 : tensor<4x512x7x7xf16> - } - func.func private @Unknown59(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<4x512x7x7xf16> - %1 = mhlo.add %arg0, %arg1 : tensor<4x512x7x7xf16> - %2 = mhlo.maximum %1, %0 : tensor<4x512x7x7xf16> - %3 = mhlo.compare GT, %2, %0 : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xi1> - return %2, %3 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> + func.func private @Unknown62(%arg0: tensor<4x512x7x7xf16>) -> tensor<4x512xf16> attributes {__byteir_reduction_fusion__} { + %0 = mhlo.constant dense<0.000000e+00> : tensor + %1 = mhlo.reduce(%arg0 init: %0) across dimensions = [3, 2] : (tensor<4x512x7x7xf16>, tensor) -> tensor<4x512xf16> + reducer(%arg1: tensor, %arg2: tensor) { + %2 = mhlo.add %arg1, %arg2 : tensor + mhlo.return %2 : tensor + } + return %1 : tensor<4x512xf16> } - func.func private @Unknown60(%arg0: tensor<4x512xf16>) -> tensor<4x512xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown63(%arg0: tensor<4x512xf16>) -> tensor<4x512xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<2.040100e-02> : tensor<4x512xf16> %1 = mhlo.multiply %arg0, %0 : tensor<4x512xf16> return %1 : tensor<4x512xf16> } - func.func private @Unknown61(%arg0: tensor<1000xf32>, %arg1: tensor<4x1000xf16>) -> tensor<4x1000xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 : (tensor<1000xf32>) -> tensor<1000xf16> - %1 = "mhlo.broadcast_in_dim"(%0) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1000xf16>) -> tensor<4x1000xf16> - %2 = mhlo.add %arg1, %1 : tensor<4x1000xf16> - return %2 : tensor<4x1000xf16> + func.func private @Unknown64(%arg0: tensor<1000xf16>, %arg1: tensor<4x1000xf16>) -> tensor<4x1000xf16> attributes {__byteir_elementwise_fusion__} { + %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1000xf16>) -> tensor<4x1000xf16> + %1 = mhlo.add %arg1, %0 : tensor<4x1000xf16> + return %1 : tensor<4x1000xf16> + } + func.func private @Unknown65(%arg0: tensor<4x1000xf16>) -> tensor<4xf16> attributes {__byteir_reduction_fusion__} { + %0 = mhlo.constant dense<0xFC00> : tensor + %1 = mhlo.reduce(%arg0 init: %0) across dimensions = [1] : (tensor<4x1000xf16>, tensor) -> tensor<4xf16> + reducer(%arg1: tensor, %arg2: tensor) { + %2 = mhlo.maximum %arg1, %arg2 : tensor + mhlo.return %2 : tensor + } + return %1 : tensor<4xf16> } - func.func private @Unknown62(%arg0: tensor<4xf16>, %arg1: tensor<4x1000xf16>) -> (tensor<4x1000xf16>, tensor<4x1000xf16>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown66(%arg0: tensor<4xf16>, %arg1: tensor<4x1000xf16>) -> tensor<4x1000xf16> attributes {__byteir_elementwise_fusion__} { %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<4xf16>) -> tensor<4x1000xf16> %1 = mhlo.subtract %arg1, %0 : tensor<4x1000xf16> - %2 = mhlo.exponential %1 : tensor<4x1000xf16> - return %1, %2 : tensor<4x1000xf16>, tensor<4x1000xf16> + return %1 : tensor<4x1000xf16> + } + func.func private @Unknown67(%arg0: tensor<4x1000xf16>) -> tensor<4xf16> attributes {__byteir_reduction_fusion__} { + %0 = mhlo.constant dense<0.000000e+00> : tensor + %1 = 
mhlo.exponential %arg0 : tensor<4x1000xf16> + %2 = mhlo.reduce(%1 init: %0) across dimensions = [1] : (tensor<4x1000xf16>, tensor) -> tensor<4xf16> + reducer(%arg1: tensor, %arg2: tensor) { + %3 = mhlo.add %arg1, %arg2 : tensor + mhlo.return %3 : tensor + } + return %2 : tensor<4xf16> } - func.func private @Unknown63(%arg0: tensor<4xf16>, %arg1: tensor<4x1000xf16>, %arg2: tensor<4xf16>, %arg3: tensor<4x1000xf16>, %arg4: tensor<4x1000xf32>) -> (tensor<4x1000xf16>, tensor<4x1000xf32>, tensor<4x1000xf32>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown68(%arg0: tensor<4xf16>) -> tensor<4xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.log %arg0 : tensor<4xf16> - %1 = "mhlo.broadcast_in_dim"(%0) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<4xf16>) -> tensor<4x1000xf16> - %2 = mhlo.subtract %arg1, %1 : tensor<4x1000xf16> - %3 = mhlo.exponential %2 : tensor<4x1000xf16> - %4 = "mhlo.broadcast_in_dim"(%arg2) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<4xf16>) -> tensor<4x1000xf16> - %5 = mhlo.multiply %3, %4 : tensor<4x1000xf16> - %6 = mhlo.subtract %arg3, %5 : tensor<4x1000xf16> - %7 = mhlo.convert %2 : (tensor<4x1000xf16>) -> tensor<4x1000xf32> - %8 = mhlo.multiply %7, %arg4 : tensor<4x1000xf32> - %9 = mhlo.convert %6 : (tensor<4x1000xf16>) -> tensor<4x1000xf32> - return %6, %8, %9 : tensor<4x1000xf16>, tensor<4x1000xf32>, tensor<4x1000xf32> - } - func.func private @Unknown64(%arg0: tensor<4x512xf16>, %arg1: tensor<4x512x7x7xi1>) -> tensor<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + return %0 : tensor<4xf16> + } + func.func private @Unknown69(%arg0: tensor<4xf16>, %arg1: tensor<4x1000xf16>, %arg2: tensor<4xf16>, %arg3: tensor<4x1000xf16>) -> (tensor<4x1000xf16>, tensor<4x1000xf16>) attributes {__byteir_elementwise_fusion__} { + %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<4xf16>) -> tensor<4x1000xf16> + %1 = mhlo.subtract %arg1, %0 : tensor<4x1000xf16> + %2 = mhlo.exponential %1 : tensor<4x1000xf16> + %3 = "mhlo.broadcast_in_dim"(%arg2) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<4xf16>) -> tensor<4x1000xf16> + %4 = mhlo.multiply %2, %3 : tensor<4x1000xf16> + %5 = mhlo.subtract %arg3, %4 : tensor<4x1000xf16> + return %1, %5 : tensor<4x1000xf16>, tensor<4x1000xf16> + } + func.func private @Unknown70(%arg0: tensor<4x512xf16>, %arg1: tensor<4x512x7x7xi1>) -> tensor<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<4.900000e+01> : tensor<4x512x7x7xf16> %1 = mhlo.constant dense<0.000000e+00> : tensor<4x512x7x7xf16> %2 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<4x512xf16>) -> tensor<4x512x7x7xf16> @@ -365,803 +226,508 @@ module @IrToMhlo.2452 { %4 = mhlo.select %arg1, %3, %1 : tensor<4x512x7x7xi1>, tensor<4x512x7x7xf16> return %4 : tensor<4x512x7x7xf16> } - func.func private @BatchNormGradOp65(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %1 = mhlo.convert %arg2 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<512xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) 
{epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<4x512x7x7xf32>) -> (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) + func.func private @BatchNormGradOp71(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { + %0 = mhlo.constant dense<0.000000e+00> : tensor<512xf32> + %1 = mhlo.convert %arg0 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> + %2 = mhlo.convert %arg2 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> + %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<4x512x7x7xf32>) -> (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) %3 = mhlo.convert %grad_operand : (tensor<4x512x7x7xf32>) -> tensor<4x512x7x7xf16> return %3, %grad_scale, %grad_offset : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> } - func.func private @ConvBackwardDataOp66(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { + func.func private @ConvBackwardDataOp72(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,512,512]{1,0,2,3}"} : (tensor<512x512x3x3xf16>) -> tensor<3x3x512x512xf16> %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,512,512]{1,0,2,3}"} : (tensor<3x3x512x512xf16>) -> tensor<3x3x512x512xf16> %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<3x3x512x512xf16>) -> tensor<4x512x7x7xf16> return %2 : tensor<4x512x7x7xf16> } - func.func private @ConvBackwardFilterOp67(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<4x512x7x7xf16>) -> tensor<512x512x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { + func.func private @ConvBackwardFilterOp73(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<4x512x7x7xf16>) -> tensor<512x512x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, 
__byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> tensor<3x3x512x512xf16> %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[512,512,3,3]{0,1,3,2}"} : (tensor<3x3x512x512xf16>) -> tensor<512x512x3x3xf16> return %1 : tensor<512x512x3x3xf16> } - func.func private @Unknown68(%arg0: tensor<4x512x7x7xi1>, %arg1: tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown74(%arg0: tensor<4x512x7x7xi1>, %arg1: tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.000000e+00> : tensor<4x512x7x7xf16> %1 = mhlo.select %arg0, %arg1, %0 : tensor<4x512x7x7xi1>, tensor<4x512x7x7xf16> return %1 : tensor<4x512x7x7xf16> } - func.func private @BatchNormGradOp69(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %1 = mhlo.convert %arg2 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<512xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<4x512x7x7xf32>) -> (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x512x7x7xf32>) -> tensor<4x512x7x7xf16> - return %3, %grad_scale, %grad_offset : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - } - func.func private @ConvBackwardDataOp70(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,512,512]{1,0,2,3}"} : (tensor<512x512x3x3xf16>) -> tensor<3x3x512x512xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,512,512]{1,0,2,3}"} : (tensor<3x3x512x512xf16>) -> tensor<3x3x512x512xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<3x3x512x512xf16>) -> tensor<4x512x7x7xf16> - return %2 : tensor<4x512x7x7xf16> - } - func.func private 
@ConvBackwardFilterOp71(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<4x512x7x7xf16>) -> tensor<512x512x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> tensor<3x3x512x512xf16> - %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[512,512,3,3]{0,1,3,2}"} : (tensor<3x3x512x512xf16>) -> tensor<512x512x3x3xf16> - return %1 : tensor<512x512x3x3xf16> - } - func.func private @Unknown72(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<4x512x7x7xf16>, %arg2: tensor<4x512x7x7xi1>) -> tensor<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown78(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<4x512x7x7xf16>, %arg2: tensor<4x512x7x7xi1>) -> tensor<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.000000e+00> : tensor<4x512x7x7xf16> %1 = mhlo.add %arg0, %arg1 : tensor<4x512x7x7xf16> %2 = mhlo.select %arg2, %1, %0 : tensor<4x512x7x7xi1>, tensor<4x512x7x7xf16> return %2 : tensor<4x512x7x7xf16> } - func.func private @BatchNormGradOp73(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %1 = mhlo.convert %arg2 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<512xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<4x512x7x7xf32>) -> (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x512x7x7xf32>) -> tensor<4x512x7x7xf16> - return %3, %grad_scale, %grad_offset : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - } - func.func private @ConvBackwardDataOp74(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,512,512]{1,0,2,3}"} : (tensor<512x512x3x3xf16>) -> tensor<3x3x512x512xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,512,512]{1,0,2,3}"} : (tensor<3x3x512x512xf16>) -> tensor<3x3x512x512xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = 
[[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<3x3x512x512xf16>) -> tensor<4x512x7x7xf16> - return %2 : tensor<4x512x7x7xf16> - } - func.func private @ConvBackwardFilterOp75(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<4x512x7x7xf16>) -> tensor<512x512x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> tensor<3x3x512x512xf16> - %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[512,512,3,3]{0,1,3,2}"} : (tensor<3x3x512x512xf16>) -> tensor<512x512x3x3xf16> - return %1 : tensor<512x512x3x3xf16> - } - func.func private @Unknown76(%arg0: tensor<4x512x7x7xi1>, %arg1: tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<4x512x7x7xf16> - %1 = mhlo.select %arg0, %arg1, %0 : tensor<4x512x7x7xi1>, tensor<4x512x7x7xf16> - return %1 : tensor<4x512x7x7xf16> - } - func.func private @BatchNormGradOp77(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %1 = mhlo.convert %arg2 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<512xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<4x512x7x7xf32>) -> (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x512x7x7xf32>) -> tensor<4x512x7x7xf16> - return %3, %grad_scale, %grad_offset : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - } - func.func private @ConvBackwardDataOp78(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512x256x3x3xf16>) -> tensor<4x256x14x14xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { + func.func private @ConvBackwardDataOp84(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512x256x3x3xf16>) -> tensor<4x256x14x14xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = 
"ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,256,512]{1,0,2,3}"} : (tensor<512x256x3x3xf16>) -> tensor<3x3x256x512xf16> %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,256,512]{1,0,2,3}"} : (tensor<3x3x256x512xf16>) -> tensor<3x3x256x512xf16> %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 2], [1, 2]], lhs_dilate = [2, 2], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<3x3x256x512xf16>) -> tensor<4x256x14x14xf16> return %2 : tensor<4x256x14x14xf16> } - func.func private @ConvBackwardFilterOp79(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x512x7x7xf16>) -> tensor<512x256x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { + func.func private @ConvBackwardFilterOp85(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x512x7x7xf16>) -> tensor<512x256x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 0], [1, 0]], lhs_dilate = [1, 1], rhs_dilate = [2, 2]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<4x512x7x7xf16>) -> tensor<3x3x256x512xf16> %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[512,256,3,3]{0,1,3,2}"} : (tensor<3x3x256x512xf16>) -> tensor<512x256x3x3xf16> return %1 : tensor<512x256x3x3xf16> } - func.func private @BatchNormGradOp80(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %1 = mhlo.convert %arg2 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<512xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<4x512x7x7xf32>) -> (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x512x7x7xf32>) -> tensor<4x512x7x7xf16> - return %3, %grad_scale, %grad_offset : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - } - func.func private @ConvBackwardDataOp81(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512x256x1x1xf16>) -> tensor<4x256x14x14xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", 
__byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { + func.func private @ConvBackwardDataOp87(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512x256x1x1xf16>) -> tensor<4x256x14x14xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[1,1,256,512]{1,0,2,3}"} : (tensor<512x256x1x1xf16>) -> tensor<1x1x256x512xf16> %1 = mhlo.convolution(%arg0, %0) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 1], [0, 1]], lhs_dilate = [2, 2], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<1x1x256x512xf16>) -> tensor<4x256x14x14xf16> return %1 : tensor<4x256x14x14xf16> } - func.func private @ConvBackwardFilterOp82(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x512x7x7xf16>) -> tensor<512x256x1x1xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { + func.func private @ConvBackwardFilterOp88(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x512x7x7xf16>) -> tensor<512x256x1x1xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[0, -1], [0, -1]], lhs_dilate = [1, 1], rhs_dilate = [2, 2]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<4x512x7x7xf16>) -> tensor<1x1x256x512xf16> %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[512,256,1,1]{0,1,3,2}"} : (tensor<1x1x256x512xf16>) -> tensor<512x256x1x1xf16> return %1 : tensor<512x256x1x1xf16> } - func.func private @Unknown83(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x256x14x14xf16>, %arg2: tensor<4x256x14x14xi1>) -> tensor<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown89(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x256x14x14xf16>, %arg2: tensor<4x256x14x14xi1>) -> tensor<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.000000e+00> : tensor<4x256x14x14xf16> %1 = mhlo.add %arg0, %arg1 : tensor<4x256x14x14xf16> %2 = mhlo.select %arg2, %1, %0 : tensor<4x256x14x14xi1>, tensor<4x256x14x14xf16> return %2 : tensor<4x256x14x14xf16> } - func.func private @BatchNormGradOp84(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, 
__byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %1 = mhlo.convert %arg2 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<256xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<4x256x14x14xf32>) -> (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) + func.func private @BatchNormGradOp90(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { + %0 = mhlo.constant dense<0.000000e+00> : tensor<256xf32> + %1 = mhlo.convert %arg0 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> + %2 = mhlo.convert %arg2 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> + %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<4x256x14x14xf32>) -> (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) %3 = mhlo.convert %grad_operand : (tensor<4x256x14x14xf32>) -> tensor<4x256x14x14xf16> return %3, %grad_scale, %grad_offset : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> } - func.func private @ConvBackwardDataOp85(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { + func.func private @ConvBackwardDataOp91(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,256,256]{1,0,2,3}"} : (tensor<256x256x3x3xf16>) -> tensor<3x3x256x256xf16> %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,256,256]{1,0,2,3}"} : (tensor<3x3x256x256xf16>) -> tensor<3x3x256x256xf16> %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<3x3x256x256xf16>) -> tensor<4x256x14x14xf16> return %2 : tensor<4x256x14x14xf16> } - func.func private @ConvBackwardFilterOp86(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x256x14x14xf16>) -> tensor<256x256x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", 
__byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { + func.func private @ConvBackwardFilterOp92(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x256x14x14xf16>) -> tensor<256x256x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> tensor<3x3x256x256xf16> %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[256,256,3,3]{0,1,3,2}"} : (tensor<3x3x256x256xf16>) -> tensor<256x256x3x3xf16> return %1 : tensor<256x256x3x3xf16> } - func.func private @Unknown87(%arg0: tensor<4x256x14x14xi1>, %arg1: tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown93(%arg0: tensor<4x256x14x14xi1>, %arg1: tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.000000e+00> : tensor<4x256x14x14xf16> %1 = mhlo.select %arg0, %arg1, %0 : tensor<4x256x14x14xi1>, tensor<4x256x14x14xf16> return %1 : tensor<4x256x14x14xf16> } - func.func private @BatchNormGradOp88(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %1 = mhlo.convert %arg2 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<256xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<4x256x14x14xf32>) -> (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x256x14x14xf32>) -> tensor<4x256x14x14xf16> - return %3, %grad_scale, %grad_offset : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - } - func.func private @ConvBackwardDataOp89(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,256,256]{1,0,2,3}"} : (tensor<256x256x3x3xf16>) -> tensor<3x3x256x256xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,256,256]{1,0,2,3}"} : 
(tensor<3x3x256x256xf16>) -> tensor<3x3x256x256xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<3x3x256x256xf16>) -> tensor<4x256x14x14xf16> - return %2 : tensor<4x256x14x14xf16> - } - func.func private @ConvBackwardFilterOp90(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x256x14x14xf16>) -> tensor<256x256x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> tensor<3x3x256x256xf16> - %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[256,256,3,3]{0,1,3,2}"} : (tensor<3x3x256x256xf16>) -> tensor<256x256x3x3xf16> - return %1 : tensor<256x256x3x3xf16> - } - func.func private @Unknown91(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x256x14x14xf16>, %arg2: tensor<4x256x14x14xi1>) -> tensor<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<4x256x14x14xf16> - %1 = mhlo.add %arg0, %arg1 : tensor<4x256x14x14xf16> - %2 = mhlo.select %arg2, %1, %0 : tensor<4x256x14x14xi1>, tensor<4x256x14x14xf16> - return %2 : tensor<4x256x14x14xf16> - } - func.func private @BatchNormGradOp92(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %1 = mhlo.convert %arg2 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<256xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<4x256x14x14xf32>) -> (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x256x14x14xf32>) -> tensor<4x256x14x14xf16> - return %3, %grad_scale, %grad_offset : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - } - func.func private @ConvBackwardDataOp93(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = 
"f16[3,3,256,256]{1,0,2,3}"} : (tensor<256x256x3x3xf16>) -> tensor<3x3x256x256xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,256,256]{1,0,2,3}"} : (tensor<3x3x256x256xf16>) -> tensor<3x3x256x256xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<3x3x256x256xf16>) -> tensor<4x256x14x14xf16> - return %2 : tensor<4x256x14x14xf16> - } - func.func private @ConvBackwardFilterOp94(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x256x14x14xf16>) -> tensor<256x256x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> tensor<3x3x256x256xf16> - %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[256,256,3,3]{0,1,3,2}"} : (tensor<3x3x256x256xf16>) -> tensor<256x256x3x3xf16> - return %1 : tensor<256x256x3x3xf16> - } - func.func private @Unknown95(%arg0: tensor<4x256x14x14xi1>, %arg1: tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<4x256x14x14xf16> - %1 = mhlo.select %arg0, %arg1, %0 : tensor<4x256x14x14xi1>, tensor<4x256x14x14xf16> - return %1 : tensor<4x256x14x14xf16> - } - func.func private @BatchNormGradOp96(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %1 = mhlo.convert %arg2 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<256xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<4x256x14x14xf32>) -> (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x256x14x14xf32>) -> tensor<4x256x14x14xf16> - return %3, %grad_scale, %grad_offset : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - } - func.func private @ConvBackwardDataOp97(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256x128x3x3xf16>) -> tensor<4x128x28x28xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { + 
func.func private @ConvBackwardDataOp103(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256x128x3x3xf16>) -> tensor<4x128x28x28xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,128,256]{1,0,2,3}"} : (tensor<256x128x3x3xf16>) -> tensor<3x3x128x256xf16> %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,128,256]{1,0,2,3}"} : (tensor<3x3x128x256xf16>) -> tensor<3x3x128x256xf16> %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 2], [1, 2]], lhs_dilate = [2, 2], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<3x3x128x256xf16>) -> tensor<4x128x28x28xf16> return %2 : tensor<4x128x28x28xf16> } - func.func private @ConvBackwardFilterOp98(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x256x14x14xf16>) -> tensor<256x128x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { + func.func private @ConvBackwardFilterOp104(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x256x14x14xf16>) -> tensor<256x128x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 0], [1, 0]], lhs_dilate = [1, 1], rhs_dilate = [2, 2]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<4x256x14x14xf16>) -> tensor<3x3x128x256xf16> %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[256,128,3,3]{0,1,3,2}"} : (tensor<3x3x128x256xf16>) -> tensor<256x128x3x3xf16> return %1 : tensor<256x128x3x3xf16> } - func.func private @BatchNormGradOp99(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %1 = mhlo.convert %arg2 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<256xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<4x256x14x14xf32>) -> (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %3 = mhlo.convert 
%grad_operand : (tensor<4x256x14x14xf32>) -> tensor<4x256x14x14xf16> - return %3, %grad_scale, %grad_offset : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - } - func.func private @ConvBackwardDataOp100(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256x128x1x1xf16>) -> tensor<4x128x28x28xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { + func.func private @ConvBackwardDataOp106(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256x128x1x1xf16>) -> tensor<4x128x28x28xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[1,1,128,256]{1,0,2,3}"} : (tensor<256x128x1x1xf16>) -> tensor<1x1x128x256xf16> %1 = mhlo.convolution(%arg0, %0) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 1], [0, 1]], lhs_dilate = [2, 2], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<1x1x128x256xf16>) -> tensor<4x128x28x28xf16> return %1 : tensor<4x128x28x28xf16> } - func.func private @ConvBackwardFilterOp101(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x256x14x14xf16>) -> tensor<256x128x1x1xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { + func.func private @ConvBackwardFilterOp107(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x256x14x14xf16>) -> tensor<256x128x1x1xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[0, -1], [0, -1]], lhs_dilate = [1, 1], rhs_dilate = [2, 2]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<4x256x14x14xf16>) -> tensor<1x1x128x256xf16> %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[256,128,1,1]{0,1,3,2}"} : (tensor<1x1x128x256xf16>) -> tensor<256x128x1x1xf16> return %1 : tensor<256x128x1x1xf16> } - func.func private @Unknown102(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>, %arg2: tensor<4x128x28x28xi1>) -> tensor<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown108(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>, %arg2: tensor<4x128x28x28xi1>) -> tensor<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { %0 = 
mhlo.constant dense<0.000000e+00> : tensor<4x128x28x28xf16> %1 = mhlo.add %arg0, %arg1 : tensor<4x128x28x28xf16> %2 = mhlo.select %arg2, %1, %0 : tensor<4x128x28x28xi1>, tensor<4x128x28x28xf16> return %2 : tensor<4x128x28x28xf16> } - func.func private @BatchNormGradOp103(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %1 = mhlo.convert %arg2 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<128xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<4x128x28x28xf32>) -> (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) + func.func private @BatchNormGradOp109(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { + %0 = mhlo.constant dense<0.000000e+00> : tensor<128xf32> + %1 = mhlo.convert %arg0 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> + %2 = mhlo.convert %arg2 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> + %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<4x128x28x28xf32>) -> (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) %3 = mhlo.convert %grad_operand : (tensor<4x128x28x28xf32>) -> tensor<4x128x28x28xf16> return %3, %grad_scale, %grad_offset : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> } - func.func private @ConvBackwardDataOp104(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { + func.func private @ConvBackwardDataOp110(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,128,128]{1,0,2,3}"} : (tensor<128x128x3x3xf16>) -> tensor<3x3x128x128xf16> %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,128,128]{1,0,2,3}"} : (tensor<3x3x128x128xf16>) -> tensor<3x3x128x128xf16> %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 
1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<3x3x128x128xf16>) -> tensor<4x128x28x28xf16> return %2 : tensor<4x128x28x28xf16> } - func.func private @ConvBackwardFilterOp105(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>) -> tensor<128x128x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { + func.func private @ConvBackwardFilterOp111(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>) -> tensor<128x128x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> tensor<3x3x128x128xf16> %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[128,128,3,3]{0,1,3,2}"} : (tensor<3x3x128x128xf16>) -> tensor<128x128x3x3xf16> return %1 : tensor<128x128x3x3xf16> } - func.func private @Unknown106(%arg0: tensor<4x128x28x28xi1>, %arg1: tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown112(%arg0: tensor<4x128x28x28xi1>, %arg1: tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.000000e+00> : tensor<4x128x28x28xf16> %1 = mhlo.select %arg0, %arg1, %0 : tensor<4x128x28x28xi1>, tensor<4x128x28x28xf16> return %1 : tensor<4x128x28x28xf16> } - func.func private @BatchNormGradOp107(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %1 = mhlo.convert %arg2 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<128xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<4x128x28x28xf32>) -> (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x128x28x28xf32>) -> tensor<4x128x28x28xf16> - return %3, %grad_scale, %grad_offset : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - } - func.func private @ConvBackwardDataOp108(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = 
"NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,128,128]{1,0,2,3}"} : (tensor<128x128x3x3xf16>) -> tensor<3x3x128x128xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,128,128]{1,0,2,3}"} : (tensor<3x3x128x128xf16>) -> tensor<3x3x128x128xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<3x3x128x128xf16>) -> tensor<4x128x28x28xf16> - return %2 : tensor<4x128x28x28xf16> - } - func.func private @ConvBackwardFilterOp109(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>) -> tensor<128x128x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> tensor<3x3x128x128xf16> - %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[128,128,3,3]{0,1,3,2}"} : (tensor<3x3x128x128xf16>) -> tensor<128x128x3x3xf16> - return %1 : tensor<128x128x3x3xf16> - } - func.func private @Unknown110(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>, %arg2: tensor<4x128x28x28xi1>) -> tensor<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<4x128x28x28xf16> - %1 = mhlo.add %arg0, %arg1 : tensor<4x128x28x28xf16> - %2 = mhlo.select %arg2, %1, %0 : tensor<4x128x28x28xi1>, tensor<4x128x28x28xf16> - return %2 : tensor<4x128x28x28xf16> - } - func.func private @BatchNormGradOp111(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %1 = mhlo.convert %arg2 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<128xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<4x128x28x28xf32>) -> (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x128x28x28xf32>) -> tensor<4x128x28x28xf16> - return %3, %grad_scale, %grad_offset : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - } - func.func private @ConvBackwardDataOp112(%arg0: tensor<4x128x28x28xf16>, %arg1: 
tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,128,128]{1,0,2,3}"} : (tensor<128x128x3x3xf16>) -> tensor<3x3x128x128xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,128,128]{1,0,2,3}"} : (tensor<3x3x128x128xf16>) -> tensor<3x3x128x128xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<3x3x128x128xf16>) -> tensor<4x128x28x28xf16> - return %2 : tensor<4x128x28x28xf16> - } - func.func private @ConvBackwardFilterOp113(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>) -> tensor<128x128x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> tensor<3x3x128x128xf16> - %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[128,128,3,3]{0,1,3,2}"} : (tensor<3x3x128x128xf16>) -> tensor<128x128x3x3xf16> - return %1 : tensor<128x128x3x3xf16> - } - func.func private @Unknown114(%arg0: tensor<4x128x28x28xi1>, %arg1: tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<4x128x28x28xf16> - %1 = mhlo.select %arg0, %arg1, %0 : tensor<4x128x28x28xi1>, tensor<4x128x28x28xf16> - return %1 : tensor<4x128x28x28xf16> - } - func.func private @BatchNormGradOp115(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %1 = mhlo.convert %arg2 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<128xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<4x128x28x28xf32>) -> (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x128x28x28xf32>) -> tensor<4x128x28x28xf16> - return %3, %grad_scale, %grad_offset : tensor<4x128x28x28xf16>, tensor<128xf32>, 
tensor<128xf32> - } - func.func private @ConvBackwardDataOp116(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128x64x3x3xf16>) -> tensor<4x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { + func.func private @ConvBackwardDataOp122(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128x64x3x3xf16>) -> tensor<4x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,64,128]{1,0,2,3}"} : (tensor<128x64x3x3xf16>) -> tensor<3x3x64x128xf16> %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,64,128]{1,0,2,3}"} : (tensor<3x3x64x128xf16>) -> tensor<3x3x64x128xf16> %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 2], [1, 2]], lhs_dilate = [2, 2], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<3x3x64x128xf16>) -> tensor<4x64x56x56xf16> return %2 : tensor<4x64x56x56xf16> } - func.func private @ConvBackwardFilterOp117(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x128x28x28xf16>) -> tensor<128x64x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { + func.func private @ConvBackwardFilterOp123(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x128x28x28xf16>) -> tensor<128x64x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 0], [1, 0]], lhs_dilate = [1, 1], rhs_dilate = [2, 2]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<4x128x28x28xf16>) -> tensor<3x3x64x128xf16> %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[128,64,3,3]{0,1,3,2}"} : (tensor<3x3x64x128xf16>) -> tensor<128x64x3x3xf16> return %1 : tensor<128x64x3x3xf16> } - func.func private @BatchNormGradOp118(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %1 = mhlo.convert 
%arg2 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<128xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<4x128x28x28xf32>) -> (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x128x28x28xf32>) -> tensor<4x128x28x28xf16> - return %3, %grad_scale, %grad_offset : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - } - func.func private @ConvBackwardDataOp119(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128x64x1x1xf16>) -> tensor<4x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { + func.func private @ConvBackwardDataOp125(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128x64x1x1xf16>) -> tensor<4x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[1,1,64,128]{1,0,2,3}"} : (tensor<128x64x1x1xf16>) -> tensor<1x1x64x128xf16> %1 = mhlo.convolution(%arg0, %0) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 1], [0, 1]], lhs_dilate = [2, 2], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<1x1x64x128xf16>) -> tensor<4x64x56x56xf16> return %1 : tensor<4x64x56x56xf16> } - func.func private @ConvBackwardFilterOp120(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x128x28x28xf16>) -> tensor<128x64x1x1xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { + func.func private @ConvBackwardFilterOp126(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x128x28x28xf16>) -> tensor<128x64x1x1xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[0, -1], [0, -1]], lhs_dilate = [1, 1], rhs_dilate = [2, 2]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<4x128x28x28xf16>) -> tensor<1x1x64x128xf16> %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[128,64,1,1]{0,1,3,2}"} : (tensor<1x1x64x128xf16>) -> tensor<128x64x1x1xf16> return 
%1 : tensor<128x64x1x1xf16> } - func.func private @Unknown121(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>, %arg2: tensor<4x64x56x56xi1>) -> tensor<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown127(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>, %arg2: tensor<4x64x56x56xi1>) -> tensor<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.000000e+00> : tensor<4x64x56x56xf16> %1 = mhlo.add %arg0, %arg1 : tensor<4x64x56x56xf16> %2 = mhlo.select %arg2, %1, %0 : tensor<4x64x56x56xi1>, tensor<4x64x56x56xf16> return %2 : tensor<4x64x56x56xf16> } - func.func private @BatchNormGradOp122(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> - %1 = mhlo.convert %arg2 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<64xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<4x64x56x56xf32>) -> (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) + func.func private @BatchNormGradOp128(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { + %0 = mhlo.constant dense<0.000000e+00> : tensor<64xf32> + %1 = mhlo.convert %arg0 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> + %2 = mhlo.convert %arg2 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> + %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<4x64x56x56xf32>) -> (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) %3 = mhlo.convert %grad_operand : (tensor<4x64x56x56xf32>) -> tensor<4x64x56x56xf16> return %3, %grad_scale, %grad_offset : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> } - func.func private @ConvBackwardDataOp123(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { + func.func private @ConvBackwardDataOp129(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,64,64]{1,0,2,3}"} : (tensor<64x64x3x3xf16>) -> 
tensor<3x3x64x64xf16> %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,64,64]{1,0,2,3}"} : (tensor<3x3x64x64xf16>) -> tensor<3x3x64x64xf16> %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<3x3x64x64xf16>) -> tensor<4x64x56x56xf16> return %2 : tensor<4x64x56x56xf16> } - func.func private @ConvBackwardFilterOp124(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { + func.func private @ConvBackwardFilterOp130(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<3x3x64x64xf16> %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[64,64,3,3]{0,1,3,2}"} : (tensor<3x3x64x64xf16>) -> tensor<64x64x3x3xf16> return %1 : tensor<64x64x3x3xf16> } - func.func private @Unknown125(%arg0: tensor<4x64x56x56xi1>, %arg1: tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown131(%arg0: tensor<4x64x56x56xi1>, %arg1: tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.000000e+00> : tensor<4x64x56x56xf16> %1 = mhlo.select %arg0, %arg1, %0 : tensor<4x64x56x56xi1>, tensor<4x64x56x56xf16> return %1 : tensor<4x64x56x56xf16> } - func.func private @BatchNormGradOp126(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> - %1 = mhlo.convert %arg2 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<64xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<4x64x56x56xf32>) -> (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x64x56x56xf32>) -> tensor<4x64x56x56xf16> - return %3, %grad_scale, %grad_offset : tensor<4x64x56x56xf16>, tensor<64xf32>, 
tensor<64xf32> - } - func.func private @ConvBackwardDataOp127(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,64,64]{1,0,2,3}"} : (tensor<64x64x3x3xf16>) -> tensor<3x3x64x64xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,64,64]{1,0,2,3}"} : (tensor<3x3x64x64xf16>) -> tensor<3x3x64x64xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<3x3x64x64xf16>) -> tensor<4x64x56x56xf16> - return %2 : tensor<4x64x56x56xf16> - } - func.func private @ConvBackwardFilterOp128(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<3x3x64x64xf16> - %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[64,64,3,3]{0,1,3,2}"} : (tensor<3x3x64x64xf16>) -> tensor<64x64x3x3xf16> - return %1 : tensor<64x64x3x3xf16> - } - func.func private @Unknown129(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>, %arg2: tensor<4x64x56x56xi1>) -> tensor<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<4x64x56x56xf16> - %1 = mhlo.add %arg0, %arg1 : tensor<4x64x56x56xf16> - %2 = mhlo.select %arg2, %1, %0 : tensor<4x64x56x56xi1>, tensor<4x64x56x56xf16> - return %2 : tensor<4x64x56x56xf16> - } - func.func private @BatchNormGradOp130(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> - %1 = mhlo.convert %arg2 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<64xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<4x64x56x56xf32>) -> (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) - %3 = mhlo.convert %grad_operand : 
(tensor<4x64x56x56xf32>) -> tensor<4x64x56x56xf16> - return %3, %grad_scale, %grad_offset : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> - } - func.func private @ConvBackwardDataOp131(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,64,64]{1,0,2,3}"} : (tensor<64x64x3x3xf16>) -> tensor<3x3x64x64xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,64,64]{1,0,2,3}"} : (tensor<3x3x64x64xf16>) -> tensor<3x3x64x64xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<3x3x64x64xf16>) -> tensor<4x64x56x56xf16> - return %2 : tensor<4x64x56x56xf16> - } - func.func private @ConvBackwardFilterOp132(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<3x3x64x64xf16> - %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[64,64,3,3]{0,1,3,2}"} : (tensor<3x3x64x64xf16>) -> tensor<64x64x3x3xf16> - return %1 : tensor<64x64x3x3xf16> - } - func.func private @Unknown133(%arg0: tensor<4x64x56x56xi1>, %arg1: tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<4x64x56x56xf16> - %1 = mhlo.select %arg0, %arg1, %0 : tensor<4x64x56x56xi1>, tensor<4x64x56x56xf16> - return %1 : tensor<4x64x56x56xf16> - } - func.func private @BatchNormGradOp134(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> - %1 = mhlo.convert %arg2 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<64xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<4x64x56x56xf32>) -> (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) - %3 = 
mhlo.convert %grad_operand : (tensor<4x64x56x56xf32>) -> tensor<4x64x56x56xf16> - return %3, %grad_scale, %grad_offset : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> - } - func.func private @ConvBackwardDataOp135(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,64,64]{1,0,2,3}"} : (tensor<64x64x3x3xf16>) -> tensor<3x3x64x64xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,64,64]{1,0,2,3}"} : (tensor<3x3x64x64xf16>) -> tensor<3x3x64x64xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<3x3x64x64xf16>) -> tensor<4x64x56x56xf16> - return %2 : tensor<4x64x56x56xf16> - } - func.func private @ConvBackwardFilterOp136(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<3x3x64x64xf16> - %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[64,64,3,3]{0,1,3,2}"} : (tensor<3x3x64x64xf16>) -> tensor<64x64x3x3xf16> - return %1 : tensor<64x64x3x3xf16> - } - func.func private @Unknown137(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown143(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.add %arg0, %arg1 : tensor<4x64x56x56xf16> return %0 : tensor<4x64x56x56xf16> } - func.func private @Unknown138(%arg0: tensor<4x64x112x112xi1>, %arg1: tensor<4x64x112x112xf16>) -> tensor<4x64x112x112xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown144(%arg0: tensor<4x64x112x112xi1>, %arg1: tensor<4x64x112x112xf16>) -> tensor<4x64x112x112xf16> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<0.000000e+00> : tensor<4x64x112x112xf16> %1 = mhlo.select %arg0, %arg1, %0 : tensor<4x64x112x112xi1>, tensor<4x64x112x112xf16> return %1 : tensor<4x64x112x112xf16> } - func.func private @BatchNormGradOp139(%arg0: tensor<4x64x112x112xf16>, %arg1: tensor<64xf32>, %arg2: tensor<4x64x112x112xf16>) -> (tensor<4x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon 
= 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x64x112x112xf16>) -> tensor<4x64x112x112xf32> - %1 = mhlo.convert %arg2 : (tensor<4x64x112x112xf16>) -> tensor<4x64x112x112xf32> - %2 = mhlo.constant dense<0.000000e+00> : tensor<64xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%0, %arg1, %2, %2, %1) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x64x112x112xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<4x64x112x112xf32>) -> (tensor<4x64x112x112xf32>, tensor<64xf32>, tensor<64xf32>) + func.func private @BatchNormGradOp145(%arg0: tensor<4x64x112x112xf16>, %arg1: tensor<64xf32>, %arg2: tensor<4x64x112x112xf16>) -> (tensor<4x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { + %0 = mhlo.constant dense<0.000000e+00> : tensor<64xf32> + %1 = mhlo.convert %arg0 : (tensor<4x64x112x112xf16>) -> tensor<4x64x112x112xf32> + %2 = mhlo.convert %arg2 : (tensor<4x64x112x112xf16>) -> tensor<4x64x112x112xf32> + %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x64x112x112xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<4x64x112x112xf32>) -> (tensor<4x64x112x112xf32>, tensor<64xf32>, tensor<64xf32>) %3 = mhlo.convert %grad_operand : (tensor<4x64x112x112xf32>) -> tensor<4x64x112x112xf16> return %3, %grad_scale, %grad_offset : tensor<4x64x112x112xf16>, tensor<64xf32>, tensor<64xf32> } - func.func private @ConvBackwardFilterOp140(%arg0: tensor<4x3x224x224xf16>, %arg1: tensor<4x64x112x112xf16>) -> tensor<64x3x7x7xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<3> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { + func.func private @ConvBackwardFilterOp146(%arg0: tensor<4x3x224x224xf16>, %arg1: tensor<4x64x112x112xf16>) -> tensor<64x3x7x7xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<3> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[3, 2], [3, 2]], lhs_dilate = [1, 1], rhs_dilate = [2, 2]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x3x224x224xf16>, tensor<4x64x112x112xf16>) -> tensor<7x7x3x64xf16> %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[64,3,7,7]{0,1,3,2}"} : (tensor<7x7x3x64xf16>) -> tensor<64x3x7x7xf16> return %1 : tensor<64x3x7x7xf16> } - func.func private @Unknown141(%arg0: tensor) -> tensor attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown147(%arg0: tensor<4x1000xf16>, %arg1: tensor<4x1000xf32>) -> tensor attributes {__byteir_reduction_fusion__} { + %0 = mhlo.constant dense<0.000000e+00> : tensor + %1 = mhlo.convert %arg0 : (tensor<4x1000xf16>) -> tensor<4x1000xf32> + %2 = mhlo.multiply %1, %arg1 : 
tensor<4x1000xf32> + %3 = mhlo.reduce(%2 init: %0) across dimensions = [0, 1] : (tensor<4x1000xf32>, tensor) -> tensor + reducer(%arg2: tensor, %arg3: tensor) { + %4 = mhlo.add %arg2, %arg3 : tensor + mhlo.return %4 : tensor + } + return %3 : tensor + } + func.func private @Unknown148(%arg0: tensor) -> tensor attributes {__byteir_elementwise_fusion__} { %0 = mhlo.constant dense<4.000000e+00> : tensor %1 = mhlo.negate %arg0 : tensor %2 = mhlo.divide %1, %0 : tensor return %2 : tensor } - func.func private @Unknown142(%arg0: tensor<64x3x7x7xf16>) -> tensor<64x3x7x7xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown149(%arg0: tensor<64x3x7x7xf16>) -> tensor<64x3x7x7xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 {xla_shape = "f32[64,3,7,7]{0,1,3,2}"} : (tensor<64x3x7x7xf16>) -> tensor<64x3x7x7xf32> return %0 : tensor<64x3x7x7xf32> } - func.func private @Unknown143(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 {xla_shape = "f32[64,64,3,3]{0,1,3,2}"} : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - return %0 : tensor<64x64x3x3xf32> - } - func.func private @Unknown144(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 {xla_shape = "f32[64,64,3,3]{0,1,3,2}"} : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - return %0 : tensor<64x64x3x3xf32> - } - func.func private @Unknown145(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 {xla_shape = "f32[64,64,3,3]{0,1,3,2}"} : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - return %0 : tensor<64x64x3x3xf32> - } - func.func private @Unknown146(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown150(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 {xla_shape = "f32[64,64,3,3]{0,1,3,2}"} : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> return %0 : tensor<64x64x3x3xf32> } - func.func private @Unknown147(%arg0: tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown154(%arg0: tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 {xla_shape = "f32[128,64,3,3]{0,1,3,2}"} : (tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> return %0 : tensor<128x64x3x3xf32> } - func.func private @Unknown148(%arg0: tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown155(%arg0: tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 {xla_shape = "f32[128,128,3,3]{0,1,3,2}"} : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> return %0 : tensor<128x128x3x3xf32> } - func.func private @Unknown149(%arg0: tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown156(%arg0: tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 {xla_shape = "f32[128,64,1,1]{0,1,3,2}"} : (tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> return %0 : tensor<128x64x1x1xf32> } - func.func private @Unknown150(%arg0: tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> 
attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 {xla_shape = "f32[128,128,3,3]{0,1,3,2}"} : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> - return %0 : tensor<128x128x3x3xf32> - } - func.func private @Unknown151(%arg0: tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 {xla_shape = "f32[128,128,3,3]{0,1,3,2}"} : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> - return %0 : tensor<128x128x3x3xf32> - } - func.func private @Unknown152(%arg0: tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown159(%arg0: tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 {xla_shape = "f32[256,128,3,3]{0,1,3,2}"} : (tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> return %0 : tensor<256x128x3x3xf32> } - func.func private @Unknown153(%arg0: tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown160(%arg0: tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 {xla_shape = "f32[256,256,3,3]{0,1,3,2}"} : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> return %0 : tensor<256x256x3x3xf32> } - func.func private @Unknown154(%arg0: tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown161(%arg0: tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 {xla_shape = "f32[256,128,1,1]{0,1,3,2}"} : (tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> return %0 : tensor<256x128x1x1xf32> } - func.func private @Unknown155(%arg0: tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 {xla_shape = "f32[256,256,3,3]{0,1,3,2}"} : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> - return %0 : tensor<256x256x3x3xf32> - } - func.func private @Unknown156(%arg0: tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 {xla_shape = "f32[256,256,3,3]{0,1,3,2}"} : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> - return %0 : tensor<256x256x3x3xf32> - } - func.func private @Unknown157(%arg0: tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown164(%arg0: tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 {xla_shape = "f32[512,256,3,3]{0,1,3,2}"} : (tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> return %0 : tensor<512x256x3x3xf32> } - func.func private @Unknown158(%arg0: tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown165(%arg0: tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 {xla_shape = "f32[512,512,3,3]{0,1,3,2}"} : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> return %0 : tensor<512x512x3x3xf32> } - func.func private @Unknown159(%arg0: tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown166(%arg0: tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> attributes {__byteir_elementwise_fusion__} { %0 = 
mhlo.convert %arg0 {xla_shape = "f32[512,256,1,1]{0,1,3,2}"} : (tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> return %0 : tensor<512x256x1x1xf32> } - func.func private @Unknown160(%arg0: tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 {xla_shape = "f32[512,512,3,3]{0,1,3,2}"} : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> - return %0 : tensor<512x512x3x3xf32> - } - func.func private @Unknown161(%arg0: tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = mhlo.convert %arg0 {xla_shape = "f32[512,512,3,3]{0,1,3,2}"} : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> - return %0 : tensor<512x512x3x3xf32> - } - func.func private @MatmulOp162(%arg0: tensor<4x512xf16>, %arg1: tensor<4x1000xf16>) -> tensor<1000x512xf16> attributes {__byre__lhs_contracting_dimension = 0 : i64, __byre__output_transpose, __byre__rhs_contracting_dimension = 0 : i64, byre_compute_name = "MatmulOp"} { + func.func private @MatmulOp169(%arg0: tensor<4x512xf16>, %arg1: tensor<4x1000xf16>) -> tensor<1000x512xf16> attributes {__byre__lhs_contracting_dimension = 0 : i64, __byre__output_transpose, __byre__rhs_contracting_dimension = 0 : i64, byre_compute_name = "MatmulOp"} { %0 = "mhlo.dot_general"(%arg0, %arg1) {dot_dimension_numbers = #mhlo.dot, precision_config = [#mhlo, #mhlo]} : (tensor<4x512xf16>, tensor<4x1000xf16>) -> tensor<512x1000xf16> %1 = "mhlo.transpose"(%0) {permutation = dense<[1, 0]> : tensor<2xi64>, xla_shape = "f16[1000,512]{0,1}"} : (tensor<512x1000xf16>) -> tensor<1000x512xf16> return %1 : tensor<1000x512xf16> } - func.func private @Unknown163(%arg0: tensor<1000x512xf16>) -> tensor<1000x512xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown170(%arg0: tensor<1000x512xf16>) -> tensor<1000x512xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 {xla_shape = "f32[1000,512]{0,1}"} : (tensor<1000x512xf16>) -> tensor<1000x512xf32> return %0 : tensor<1000x512xf32> } - func.func private @Unknown164(%arg0: tensor<1000xf32>) -> tensor<1000xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown171(%arg0: tensor<4x1000xf16>) -> tensor<1000xf32> attributes {__byteir_reduction_fusion__} { + %0 = mhlo.constant dense<0.000000e+00> : tensor + %1 = mhlo.convert %arg0 : (tensor<4x1000xf16>) -> tensor<4x1000xf32> + %2 = mhlo.reduce(%1 init: %0) across dimensions = [0] : (tensor<4x1000xf32>, tensor) -> tensor<1000xf32> + reducer(%arg1: tensor, %arg2: tensor) { + %3 = mhlo.add %arg1, %arg2 : tensor + mhlo.return %3 : tensor + } + return %2 : tensor<1000xf32> + } + func.func private @Unknown172(%arg0: tensor<1000xf32>) -> tensor<1000xf32> attributes {__byteir_elementwise_fusion__} { %0 = mhlo.convert %arg0 : (tensor<1000xf32>) -> tensor<1000xf16> %1 = mhlo.convert %0 : (tensor<1000xf16>) -> tensor<1000xf32> return %1 : tensor<1000xf32> } func.func @main(%arg0: tensor<4x3x224x224xf32>, %arg1: tensor<4x1000xf32>, %arg2: tensor<64x3x7x7xf32>, %arg3: tensor<64xf32>, %arg4: tensor<64xf32>, %arg5: tensor<64xf32>, %arg6: tensor<64xf32>, %arg7: tensor<64x64x3x3xf32>, %arg8: tensor<64xf32>, %arg9: tensor<64xf32>, %arg10: tensor<64xf32>, %arg11: tensor<64xf32>, %arg12: tensor<64x64x3x3xf32>, %arg13: tensor<64xf32>, %arg14: tensor<64xf32>, %arg15: tensor<64xf32>, %arg16: tensor<64xf32>, %arg17: tensor<64x64x3x3xf32>, %arg18: tensor<64xf32>, %arg19: tensor<64xf32>, %arg20: tensor<64xf32>, %arg21: tensor<64xf32>, %arg22: 
tensor<64x64x3x3xf32>, %arg23: tensor<64xf32>, %arg24: tensor<64xf32>, %arg25: tensor<64xf32>, %arg26: tensor<64xf32>, %arg27: tensor<128x64x3x3xf32>, %arg28: tensor<128xf32>, %arg29: tensor<128xf32>, %arg30: tensor<128xf32>, %arg31: tensor<128xf32>, %arg32: tensor<128x128x3x3xf32>, %arg33: tensor<128xf32>, %arg34: tensor<128xf32>, %arg35: tensor<128xf32>, %arg36: tensor<128xf32>, %arg37: tensor<128x64x1x1xf32>, %arg38: tensor<128xf32>, %arg39: tensor<128xf32>, %arg40: tensor<128xf32>, %arg41: tensor<128xf32>, %arg42: tensor<128x128x3x3xf32>, %arg43: tensor<128xf32>, %arg44: tensor<128xf32>, %arg45: tensor<128xf32>, %arg46: tensor<128xf32>, %arg47: tensor<128x128x3x3xf32>, %arg48: tensor<128xf32>, %arg49: tensor<128xf32>, %arg50: tensor<128xf32>, %arg51: tensor<128xf32>, %arg52: tensor<256x128x3x3xf32>, %arg53: tensor<256xf32>, %arg54: tensor<256xf32>, %arg55: tensor<256xf32>, %arg56: tensor<256xf32>, %arg57: tensor<256x256x3x3xf32>, %arg58: tensor<256xf32>, %arg59: tensor<256xf32>, %arg60: tensor<256xf32>, %arg61: tensor<256xf32>, %arg62: tensor<256x128x1x1xf32>, %arg63: tensor<256xf32>, %arg64: tensor<256xf32>, %arg65: tensor<256xf32>, %arg66: tensor<256xf32>, %arg67: tensor<256x256x3x3xf32>, %arg68: tensor<256xf32>, %arg69: tensor<256xf32>, %arg70: tensor<256xf32>, %arg71: tensor<256xf32>, %arg72: tensor<256x256x3x3xf32>, %arg73: tensor<256xf32>, %arg74: tensor<256xf32>, %arg75: tensor<256xf32>, %arg76: tensor<256xf32>, %arg77: tensor<512x256x3x3xf32>, %arg78: tensor<512xf32>, %arg79: tensor<512xf32>, %arg80: tensor<512xf32>, %arg81: tensor<512xf32>, %arg82: tensor<512x512x3x3xf32>, %arg83: tensor<512xf32>, %arg84: tensor<512xf32>, %arg85: tensor<512xf32>, %arg86: tensor<512xf32>, %arg87: tensor<512x256x1x1xf32>, %arg88: tensor<512xf32>, %arg89: tensor<512xf32>, %arg90: tensor<512xf32>, %arg91: tensor<512xf32>, %arg92: tensor<512x512x3x3xf32>, %arg93: tensor<512xf32>, %arg94: tensor<512xf32>, %arg95: tensor<512xf32>, %arg96: tensor<512xf32>, %arg97: tensor<512x512x3x3xf32>, %arg98: tensor<512xf32>, %arg99: tensor<512xf32>, %arg100: tensor<512xf32>, %arg101: tensor<512xf32>, %arg102: tensor<1000x512xf32>, %arg103: tensor<1000xf32>) -> (tensor, tensor<64x3x7x7xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128x64x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x64x1x1xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256x128x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x128x1x1xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512x256x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x256x1x1xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1000x512xf32>, tensor<1000xf32>) { - %0 = mhlo.constant dense<0.000000e+00> : tensor - %1 = mhlo.constant dense<0.000000e+00> : tensor - %2 = mhlo.constant dense<0xFC00> : tensor - %3 = call 
@Unknown0(%arg0) : (tensor<4x3x224x224xf32>) -> tensor<4x3x224x224xf16> - %4 = call @Unknown1(%arg2) : (tensor<64x3x7x7xf32>) -> tensor<64x3x7x7xf16> - %5 = mhlo.convolution(%3, %4) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[3, 3], [3, 3]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x3x224x224xf16>, tensor<64x3x7x7xf16>) -> tensor<4x64x112x112xf16> - %6 = call @BatchNormTrainingOp2(%5, %arg3, %arg4) : (tensor<4x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) -> tensor<4x64x112x112xf16> - %7 = call @Unknown3(%arg7) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> - %8 = call @Unknown4(%arg12) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> - %9 = call @Unknown5(%arg17) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> - %10 = call @Unknown6(%arg22) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> - %11 = call @Unknown7(%arg37) : (tensor<128x64x1x1xf32>) -> tensor<128x64x1x1xf16> - %12 = call @Unknown8(%arg27) : (tensor<128x64x3x3xf32>) -> tensor<128x64x3x3xf16> - %13 = call @Unknown9(%arg32) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> - %14 = call @Unknown10(%arg42) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> - %15 = call @Unknown11(%arg47) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> - %16 = call @Unknown12(%arg62) : (tensor<256x128x1x1xf32>) -> tensor<256x128x1x1xf16> - %17 = call @Unknown13(%arg52) : (tensor<256x128x3x3xf32>) -> tensor<256x128x3x3xf16> - %18 = call @Unknown14(%arg57) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> - %19 = call @Unknown15(%arg67) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> - %20 = call @Unknown16(%arg72) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> - %21 = call @Unknown17(%arg87) : (tensor<512x256x1x1xf32>) -> tensor<512x256x1x1xf16> - %22 = call @Unknown18(%arg77) : (tensor<512x256x3x3xf32>) -> tensor<512x256x3x3xf16> - %23 = call @Unknown19(%arg82) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> - %24 = call @Unknown20(%arg92) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> - %25 = call @Unknown21(%arg97) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> - %26 = call @Unknown22(%arg1) : (tensor<4x1000xf32>) -> tensor<4x1000xf16> - %27 = call @Unknown23(%arg102) : (tensor<1000x512xf32>) -> tensor<1000x512xf16> - %28 = mhlo.reduce(%26 init: %1) across dimensions = [1] : (tensor<4x1000xf16>, tensor) -> tensor<4xf16> - reducer(%arg104: tensor, %arg105: tensor) { - %198 = mhlo.add %arg104, %arg105 : tensor - mhlo.return %198 : tensor - } - %29:2 = call @Unknown24(%6) : (tensor<4x64x112x112xf16>) -> (tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1>) - %30 = "mhlo.reduce_window"(%29#0, %2) ({ + %0 = mhlo.constant dense<0.000000e+00> : tensor + %1 = mhlo.constant dense<0xFC00> : tensor + %2 = call @Unknown0(%arg0) : (tensor<4x3x224x224xf32>) -> tensor<4x3x224x224xf16> + %3 = call @Unknown1(%arg2) : (tensor<64x3x7x7xf32>) -> tensor<64x3x7x7xf16> + %4 = mhlo.convolution(%2, %3) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[3, 3], [3, 3]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x3x224x224xf16>, tensor<64x3x7x7xf16>) -> tensor<4x64x112x112xf16> + %5 = call @BatchNormTrainingOp2(%4, %arg3, %arg4) : (tensor<4x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) -> 
tensor<4x64x112x112xf16> + %6 = call @Unknown3(%arg7) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> + %7 = call @Unknown3(%arg12) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> + %8 = call @Unknown3(%arg17) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> + %9 = call @Unknown3(%arg22) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> + %10 = call @Unknown7(%arg37) : (tensor<128x64x1x1xf32>) -> tensor<128x64x1x1xf16> + %11 = call @Unknown8(%arg27) : (tensor<128x64x3x3xf32>) -> tensor<128x64x3x3xf16> + %12 = call @Unknown9(%arg32) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> + %13 = call @Unknown9(%arg42) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> + %14 = call @Unknown9(%arg47) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> + %15 = call @Unknown12(%arg62) : (tensor<256x128x1x1xf32>) -> tensor<256x128x1x1xf16> + %16 = call @Unknown13(%arg52) : (tensor<256x128x3x3xf32>) -> tensor<256x128x3x3xf16> + %17 = call @Unknown14(%arg57) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> + %18 = call @Unknown14(%arg67) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> + %19 = call @Unknown14(%arg72) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> + %20 = call @Unknown17(%arg87) : (tensor<512x256x1x1xf32>) -> tensor<512x256x1x1xf16> + %21 = call @Unknown18(%arg77) : (tensor<512x256x3x3xf32>) -> tensor<512x256x3x3xf16> + %22 = call @Unknown19(%arg82) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> + %23 = call @Unknown19(%arg92) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> + %24 = call @Unknown19(%arg97) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> + %25 = call @Unknown22(%arg1) : (tensor<4x1000xf32>) -> tensor<4x1000xf16> + %26 = call @Unknown23(%arg102) : (tensor<1000x512xf32>) -> tensor<1000x512xf16> + %27 = call @Unknown24(%arg103) : (tensor<1000xf32>) -> tensor<1000xf16> + %28 = call @Unknown25(%25) : (tensor<4x1000xf16>) -> tensor<4xf16> + %29:2 = call @Unknown26(%5) : (tensor<4x64x112x112xf16>) -> (tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1>) + %30 = "mhlo.reduce_window"(%29#0, %1) ({ ^bb0(%arg104: tensor, %arg105: tensor): - %198 = mhlo.maximum %arg104, %arg105 : tensor - mhlo.return %198 : tensor + %199 = mhlo.maximum %arg104, %arg105 : tensor + mhlo.return %199 : tensor }) {base_dilations = dense<1> : tensor<4xi64>, padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = dense<1> : tensor<4xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : (tensor<4x64x112x112xf16>, tensor) -> tensor<4x64x56x56xf16> - %31 = mhlo.convolution(%30, %7) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> - %32 = call @BatchNormTrainingOp25(%31, %arg8, %arg9) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> tensor<4x64x56x56xf16> - %33:2 = call @Unknown26(%32) : (tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) - %34 = mhlo.convolution(%33#0, %8) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, 
tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> + %31 = mhlo.convolution(%30, %6) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> + %32 = call @BatchNormTrainingOp27(%31, %arg8, %arg9) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> tensor<4x64x56x56xf16> + %33:2 = call @Unknown28(%32) : (tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) + %34 = mhlo.convolution(%33#0, %7) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> %35 = call @BatchNormTrainingOp27(%34, %arg13, %arg14) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> tensor<4x64x56x56xf16> - %36:2 = call @Unknown28(%35, %30) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) - %37 = mhlo.convolution(%36#0, %9) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> - %38 = call @BatchNormTrainingOp29(%37, %arg18, %arg19) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> tensor<4x64x56x56xf16> - %39:2 = call @Unknown30(%38) : (tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) - %40 = mhlo.convolution(%39#0, %10) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> - %41 = call @BatchNormTrainingOp31(%40, %arg23, %arg24) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> tensor<4x64x56x56xf16> - %42:2 = call @Unknown32(%41, %36#0) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) - %43 = mhlo.convolution(%42#0, %11) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<128x64x1x1xf16>) -> tensor<4x128x28x28xf16> - %44 = call @BatchNormTrainingOp33(%43, %arg38, %arg39) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> tensor<4x128x28x28xf16> - %45 = mhlo.convolution(%42#0, %12) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<128x64x3x3xf16>) -> tensor<4x128x28x28xf16> - %46 = call @BatchNormTrainingOp34(%45, %arg28, %arg29) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> tensor<4x128x28x28xf16> - %47:2 = 
call @Unknown35(%46) : (tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) - %48 = mhlo.convolution(%47#0, %13) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> - %49 = call @BatchNormTrainingOp36(%48, %arg33, %arg34) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> tensor<4x128x28x28xf16> - %50:2 = call @Unknown37(%49, %44) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) - %51 = mhlo.convolution(%50#0, %14) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> - %52 = call @BatchNormTrainingOp38(%51, %arg43, %arg44) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> tensor<4x128x28x28xf16> - %53:2 = call @Unknown39(%52) : (tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) - %54 = mhlo.convolution(%53#0, %15) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> - %55 = call @BatchNormTrainingOp40(%54, %arg48, %arg49) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> tensor<4x128x28x28xf16> - %56:2 = call @Unknown41(%55, %50#0) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) - %57 = mhlo.convolution(%56#0, %16) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<256x128x1x1xf16>) -> tensor<4x256x14x14xf16> - %58 = call @BatchNormTrainingOp42(%57, %arg63, %arg64) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> tensor<4x256x14x14xf16> - %59 = mhlo.convolution(%56#0, %17) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<256x128x3x3xf16>) -> tensor<4x256x14x14xf16> - %60 = call @BatchNormTrainingOp43(%59, %arg53, %arg54) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> tensor<4x256x14x14xf16> - %61:2 = call @Unknown44(%60) : (tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) - %62 = mhlo.convolution(%61#0, %18) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> - %63 = call @BatchNormTrainingOp45(%62, 
%arg58, %arg59) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> tensor<4x256x14x14xf16> - %64:2 = call @Unknown46(%63, %58) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) - %65 = mhlo.convolution(%64#0, %19) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> - %66 = call @BatchNormTrainingOp47(%65, %arg68, %arg69) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> tensor<4x256x14x14xf16> - %67:2 = call @Unknown48(%66) : (tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) - %68 = mhlo.convolution(%67#0, %20) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> - %69 = call @BatchNormTrainingOp49(%68, %arg73, %arg74) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> tensor<4x256x14x14xf16> - %70:2 = call @Unknown50(%69, %64#0) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) - %71 = mhlo.convolution(%70#0, %21) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<512x256x1x1xf16>) -> tensor<4x512x7x7xf16> - %72 = call @BatchNormTrainingOp51(%71, %arg88, %arg89) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> tensor<4x512x7x7xf16> - %73 = mhlo.convolution(%70#0, %22) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<512x256x3x3xf16>) -> tensor<4x512x7x7xf16> - %74 = call @BatchNormTrainingOp52(%73, %arg78, %arg79) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> tensor<4x512x7x7xf16> - %75:2 = call @Unknown53(%74) : (tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) - %76 = mhlo.convolution(%75#0, %23) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> - %77 = call @BatchNormTrainingOp54(%76, %arg83, %arg84) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> tensor<4x512x7x7xf16> - %78:2 = call @Unknown55(%77, %72) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) - %79 = mhlo.convolution(%78#0, %24) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : 
(tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> - %80 = call @BatchNormTrainingOp56(%79, %arg93, %arg94) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> tensor<4x512x7x7xf16> - %81:2 = call @Unknown57(%80) : (tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) - %82 = mhlo.convolution(%81#0, %25) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> - %83 = call @BatchNormTrainingOp58(%82, %arg98, %arg99) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> tensor<4x512x7x7xf16> - %84:2 = call @Unknown59(%83, %78#0) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) - %85 = mhlo.reduce(%84#0 init: %1) across dimensions = [3, 2] : (tensor<4x512x7x7xf16>, tensor) -> tensor<4x512xf16> - reducer(%arg104: tensor, %arg105: tensor) { - %198 = mhlo.add %arg104, %arg105 : tensor - mhlo.return %198 : tensor - } - %86 = call @Unknown60(%85) : (tensor<4x512xf16>) -> tensor<4x512xf16> - %87 = "mhlo.dot_general"(%86, %27) {dot_dimension_numbers = #mhlo.dot, precision_config = [#mhlo, #mhlo]} : (tensor<4x512xf16>, tensor<1000x512xf16>) -> tensor<4x1000xf16> - %88 = call @Unknown61(%arg103, %87) : (tensor<1000xf32>, tensor<4x1000xf16>) -> tensor<4x1000xf16> - %89 = mhlo.reduce(%88 init: %2) across dimensions = [1] : (tensor<4x1000xf16>, tensor) -> tensor<4xf16> - reducer(%arg104: tensor, %arg105: tensor) { - %198 = mhlo.maximum %arg104, %arg105 : tensor - mhlo.return %198 : tensor - } - %90:2 = call @Unknown62(%89, %88) : (tensor<4xf16>, tensor<4x1000xf16>) -> (tensor<4x1000xf16>, tensor<4x1000xf16>) - %91 = mhlo.reduce(%90#1 init: %1) across dimensions = [1] : (tensor<4x1000xf16>, tensor) -> tensor<4xf16> - reducer(%arg104: tensor, %arg105: tensor) { - %198 = mhlo.add %arg104, %arg105 : tensor - mhlo.return %198 : tensor - } - %92:3 = call @Unknown63(%91, %90#0, %28, %26, %arg1) : (tensor<4xf16>, tensor<4x1000xf16>, tensor<4xf16>, tensor<4x1000xf16>, tensor<4x1000xf32>) -> (tensor<4x1000xf16>, tensor<4x1000xf32>, tensor<4x1000xf32>) - %93 = "mhlo.dot"(%92#0, %27) {precision_config = [#mhlo, #mhlo]} : (tensor<4x1000xf16>, tensor<1000x512xf16>) -> tensor<4x512xf16> - %94 = call @Unknown64(%93, %84#1) : (tensor<4x512xf16>, tensor<4x512x7x7xi1>) -> tensor<4x512x7x7xf16> - %95:3 = call @BatchNormGradOp65(%82, %arg98, %94) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %96 = call @ConvBackwardDataOp66(%95#0, %25) : (tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> - %97 = call @ConvBackwardFilterOp67(%81#0, %95#0) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> tensor<512x512x3x3xf16> - %98 = call @Unknown68(%81#1, %96) : (tensor<4x512x7x7xi1>, tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf16> - %99:3 = call @BatchNormGradOp69(%79, %arg93, %98) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %100 = call @ConvBackwardDataOp70(%99#0, %24) : (tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> - %101 = call @ConvBackwardFilterOp71(%78#0, %99#0) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> tensor<512x512x3x3xf16> - %102 = 
call @Unknown72(%94, %100, %78#1) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) -> tensor<4x512x7x7xf16> - %103:3 = call @BatchNormGradOp73(%76, %arg83, %102) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %104 = call @ConvBackwardDataOp74(%103#0, %23) : (tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> - %105 = call @ConvBackwardFilterOp75(%75#0, %103#0) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> tensor<512x512x3x3xf16> - %106 = call @Unknown76(%75#1, %104) : (tensor<4x512x7x7xi1>, tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf16> - %107:3 = call @BatchNormGradOp77(%73, %arg78, %106) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %108 = call @ConvBackwardDataOp78(%107#0, %22) : (tensor<4x512x7x7xf16>, tensor<512x256x3x3xf16>) -> tensor<4x256x14x14xf16> - %109 = call @ConvBackwardFilterOp79(%70#0, %107#0) : (tensor<4x256x14x14xf16>, tensor<4x512x7x7xf16>) -> tensor<512x256x3x3xf16> - %110:3 = call @BatchNormGradOp80(%71, %arg88, %102) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %111 = call @ConvBackwardDataOp81(%110#0, %21) : (tensor<4x512x7x7xf16>, tensor<512x256x1x1xf16>) -> tensor<4x256x14x14xf16> - %112 = call @ConvBackwardFilterOp82(%70#0, %110#0) : (tensor<4x256x14x14xf16>, tensor<4x512x7x7xf16>) -> tensor<512x256x1x1xf16> - %113 = call @Unknown83(%111, %108, %70#1) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) -> tensor<4x256x14x14xf16> - %114:3 = call @BatchNormGradOp84(%68, %arg73, %113) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %115 = call @ConvBackwardDataOp85(%114#0, %20) : (tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> - %116 = call @ConvBackwardFilterOp86(%67#0, %114#0) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> tensor<256x256x3x3xf16> - %117 = call @Unknown87(%67#1, %115) : (tensor<4x256x14x14xi1>, tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf16> - %118:3 = call @BatchNormGradOp88(%65, %arg68, %117) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %119 = call @ConvBackwardDataOp89(%118#0, %19) : (tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> - %120 = call @ConvBackwardFilterOp90(%64#0, %118#0) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> tensor<256x256x3x3xf16> - %121 = call @Unknown91(%113, %119, %64#1) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) -> tensor<4x256x14x14xf16> - %122:3 = call @BatchNormGradOp92(%62, %arg58, %121) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %123 = call @ConvBackwardDataOp93(%122#0, %18) : (tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> - %124 = call @ConvBackwardFilterOp94(%61#0, %122#0) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> tensor<256x256x3x3xf16> - %125 = call @Unknown95(%61#1, %123) : (tensor<4x256x14x14xi1>, tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf16> - %126:3 = call @BatchNormGradOp96(%59, %arg53, %125) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) 
-> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %127 = call @ConvBackwardDataOp97(%126#0, %17) : (tensor<4x256x14x14xf16>, tensor<256x128x3x3xf16>) -> tensor<4x128x28x28xf16> - %128 = call @ConvBackwardFilterOp98(%56#0, %126#0) : (tensor<4x128x28x28xf16>, tensor<4x256x14x14xf16>) -> tensor<256x128x3x3xf16> - %129:3 = call @BatchNormGradOp99(%57, %arg63, %121) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %130 = call @ConvBackwardDataOp100(%129#0, %16) : (tensor<4x256x14x14xf16>, tensor<256x128x1x1xf16>) -> tensor<4x128x28x28xf16> - %131 = call @ConvBackwardFilterOp101(%56#0, %129#0) : (tensor<4x128x28x28xf16>, tensor<4x256x14x14xf16>) -> tensor<256x128x1x1xf16> - %132 = call @Unknown102(%130, %127, %56#1) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) -> tensor<4x128x28x28xf16> - %133:3 = call @BatchNormGradOp103(%54, %arg48, %132) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %134 = call @ConvBackwardDataOp104(%133#0, %15) : (tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> - %135 = call @ConvBackwardFilterOp105(%53#0, %133#0) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> tensor<128x128x3x3xf16> - %136 = call @Unknown106(%53#1, %134) : (tensor<4x128x28x28xi1>, tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf16> - %137:3 = call @BatchNormGradOp107(%51, %arg43, %136) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %138 = call @ConvBackwardDataOp108(%137#0, %14) : (tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> - %139 = call @ConvBackwardFilterOp109(%50#0, %137#0) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> tensor<128x128x3x3xf16> - %140 = call @Unknown110(%132, %138, %50#1) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) -> tensor<4x128x28x28xf16> - %141:3 = call @BatchNormGradOp111(%48, %arg33, %140) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %142 = call @ConvBackwardDataOp112(%141#0, %13) : (tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> - %143 = call @ConvBackwardFilterOp113(%47#0, %141#0) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> tensor<128x128x3x3xf16> - %144 = call @Unknown114(%47#1, %142) : (tensor<4x128x28x28xi1>, tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf16> - %145:3 = call @BatchNormGradOp115(%45, %arg28, %144) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %146 = call @ConvBackwardDataOp116(%145#0, %12) : (tensor<4x128x28x28xf16>, tensor<128x64x3x3xf16>) -> tensor<4x64x56x56xf16> - %147 = call @ConvBackwardFilterOp117(%42#0, %145#0) : (tensor<4x64x56x56xf16>, tensor<4x128x28x28xf16>) -> tensor<128x64x3x3xf16> - %148:3 = call @BatchNormGradOp118(%43, %arg38, %140) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %149 = call @ConvBackwardDataOp119(%148#0, %11) : (tensor<4x128x28x28xf16>, tensor<128x64x1x1xf16>) -> tensor<4x64x56x56xf16> - %150 = call @ConvBackwardFilterOp120(%42#0, %148#0) : (tensor<4x64x56x56xf16>, tensor<4x128x28x28xf16>) -> 
tensor<128x64x1x1xf16> - %151 = call @Unknown121(%149, %146, %42#1) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) -> tensor<4x64x56x56xf16> - %152:3 = call @BatchNormGradOp122(%40, %arg23, %151) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) - %153 = call @ConvBackwardDataOp123(%152#0, %10) : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> - %154 = call @ConvBackwardFilterOp124(%39#0, %152#0) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> - %155 = call @Unknown125(%39#1, %153) : (tensor<4x64x56x56xi1>, tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> - %156:3 = call @BatchNormGradOp126(%37, %arg18, %155) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) - %157 = call @ConvBackwardDataOp127(%156#0, %9) : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> - %158 = call @ConvBackwardFilterOp128(%36#0, %156#0) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> - %159 = call @Unknown129(%151, %157, %36#1) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) -> tensor<4x64x56x56xf16> - %160:3 = call @BatchNormGradOp130(%34, %arg13, %159) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) - %161 = call @ConvBackwardDataOp131(%160#0, %8) : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> - %162 = call @ConvBackwardFilterOp132(%33#0, %160#0) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> - %163 = call @Unknown133(%33#1, %161) : (tensor<4x64x56x56xi1>, tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> - %164:3 = call @BatchNormGradOp134(%31, %arg8, %163) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) - %165 = call @ConvBackwardDataOp135(%164#0, %7) : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> - %166 = call @ConvBackwardFilterOp136(%30, %164#0) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> - %167 = call @Unknown137(%159, %165) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> - %168 = "mhlo.select_and_scatter"(%29#0, %167, %1) ({ + %36:2 = call @Unknown30(%35, %30) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) + %37 = mhlo.convolution(%36#0, %8) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> + %38 = call @BatchNormTrainingOp27(%37, %arg18, %arg19) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> tensor<4x64x56x56xf16> + %39:2 = call @Unknown28(%38) : (tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) + %40 = mhlo.convolution(%39#0, %9) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> 
tensor<4x64x56x56xf16> + %41 = call @BatchNormTrainingOp27(%40, %arg23, %arg24) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> tensor<4x64x56x56xf16> + %42:2 = call @Unknown30(%41, %36#0) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) + %43 = mhlo.convolution(%42#0, %10) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<128x64x1x1xf16>) -> tensor<4x128x28x28xf16> + %44 = call @BatchNormTrainingOp35(%43, %arg38, %arg39) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> tensor<4x128x28x28xf16> + %45 = mhlo.convolution(%42#0, %11) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<128x64x3x3xf16>) -> tensor<4x128x28x28xf16> + %46 = call @BatchNormTrainingOp35(%45, %arg28, %arg29) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> tensor<4x128x28x28xf16> + %47:2 = call @Unknown37(%46) : (tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) + %48 = mhlo.convolution(%47#0, %12) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> + %49 = call @BatchNormTrainingOp35(%48, %arg33, %arg34) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> tensor<4x128x28x28xf16> + %50:2 = call @Unknown39(%49, %44) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) + %51 = mhlo.convolution(%50#0, %13) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> + %52 = call @BatchNormTrainingOp35(%51, %arg43, %arg44) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> tensor<4x128x28x28xf16> + %53:2 = call @Unknown37(%52) : (tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) + %54 = mhlo.convolution(%53#0, %14) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> + %55 = call @BatchNormTrainingOp35(%54, %arg48, %arg49) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> tensor<4x128x28x28xf16> + %56:2 = call @Unknown39(%55, %50#0) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) + %57 = mhlo.convolution(%56#0, %15) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 
: i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<256x128x1x1xf16>) -> tensor<4x256x14x14xf16> + %58 = call @BatchNormTrainingOp44(%57, %arg63, %arg64) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> tensor<4x256x14x14xf16> + %59 = mhlo.convolution(%56#0, %16) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<256x128x3x3xf16>) -> tensor<4x256x14x14xf16> + %60 = call @BatchNormTrainingOp44(%59, %arg53, %arg54) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> tensor<4x256x14x14xf16> + %61:2 = call @Unknown46(%60) : (tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) + %62 = mhlo.convolution(%61#0, %17) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> + %63 = call @BatchNormTrainingOp44(%62, %arg58, %arg59) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> tensor<4x256x14x14xf16> + %64:2 = call @Unknown48(%63, %58) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) + %65 = mhlo.convolution(%64#0, %18) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> + %66 = call @BatchNormTrainingOp44(%65, %arg68, %arg69) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> tensor<4x256x14x14xf16> + %67:2 = call @Unknown46(%66) : (tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) + %68 = mhlo.convolution(%67#0, %19) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> + %69 = call @BatchNormTrainingOp44(%68, %arg73, %arg74) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> tensor<4x256x14x14xf16> + %70:2 = call @Unknown48(%69, %64#0) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) + %71 = mhlo.convolution(%70#0, %20) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<512x256x1x1xf16>) -> tensor<4x512x7x7xf16> + %72 = call @BatchNormTrainingOp53(%71, %arg88, %arg89) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> tensor<4x512x7x7xf16> + %73 = mhlo.convolution(%70#0, %21) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, 
feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<512x256x3x3xf16>) -> tensor<4x512x7x7xf16> + %74 = call @BatchNormTrainingOp53(%73, %arg78, %arg79) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> tensor<4x512x7x7xf16> + %75:2 = call @Unknown55(%74) : (tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) + %76 = mhlo.convolution(%75#0, %22) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> + %77 = call @BatchNormTrainingOp53(%76, %arg83, %arg84) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> tensor<4x512x7x7xf16> + %78:2 = call @Unknown57(%77, %72) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) + %79 = mhlo.convolution(%78#0, %23) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> + %80 = call @BatchNormTrainingOp53(%79, %arg93, %arg94) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> tensor<4x512x7x7xf16> + %81:2 = call @Unknown55(%80) : (tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) + %82 = mhlo.convolution(%81#0, %24) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> + %83 = call @BatchNormTrainingOp53(%82, %arg98, %arg99) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> tensor<4x512x7x7xf16> + %84:2 = call @Unknown57(%83, %78#0) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) + %85 = call @Unknown62(%84#0) : (tensor<4x512x7x7xf16>) -> tensor<4x512xf16> + %86 = call @Unknown63(%85) : (tensor<4x512xf16>) -> tensor<4x512xf16> + %87 = "mhlo.dot_general"(%86, %26) {dot_dimension_numbers = #mhlo.dot, precision_config = [#mhlo, #mhlo]} : (tensor<4x512xf16>, tensor<1000x512xf16>) -> tensor<4x1000xf16> + %88 = call @Unknown64(%27, %87) : (tensor<1000xf16>, tensor<4x1000xf16>) -> tensor<4x1000xf16> + %89 = call @Unknown65(%88) : (tensor<4x1000xf16>) -> tensor<4xf16> + %90 = call @Unknown66(%89, %88) : (tensor<4xf16>, tensor<4x1000xf16>) -> tensor<4x1000xf16> + %91 = call @Unknown67(%90) : (tensor<4x1000xf16>) -> tensor<4xf16> + %92 = call @Unknown68(%91) : (tensor<4xf16>) -> tensor<4xf16> + %93:2 = call @Unknown69(%92, %90, %28, %25) : (tensor<4xf16>, tensor<4x1000xf16>, tensor<4xf16>, tensor<4x1000xf16>) -> (tensor<4x1000xf16>, tensor<4x1000xf16>) + %94 = "mhlo.dot"(%93#1, %26) {precision_config = [#mhlo, #mhlo]} : (tensor<4x1000xf16>, tensor<1000x512xf16>) -> tensor<4x512xf16> + %95 = call @Unknown70(%94, %84#1) : (tensor<4x512xf16>, tensor<4x512x7x7xi1>) -> tensor<4x512x7x7xf16> + %96:3 = call @BatchNormGradOp71(%82, %arg98, %95) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, 
tensor<512xf32>) + %97 = call @ConvBackwardDataOp72(%96#0, %24) : (tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> + %98 = call @ConvBackwardFilterOp73(%81#0, %96#0) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> tensor<512x512x3x3xf16> + %99 = call @Unknown74(%81#1, %97) : (tensor<4x512x7x7xi1>, tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf16> + %100:3 = call @BatchNormGradOp71(%79, %arg93, %99) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %101 = call @ConvBackwardDataOp72(%100#0, %23) : (tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> + %102 = call @ConvBackwardFilterOp73(%78#0, %100#0) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> tensor<512x512x3x3xf16> + %103 = call @Unknown78(%95, %101, %78#1) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) -> tensor<4x512x7x7xf16> + %104:3 = call @BatchNormGradOp71(%76, %arg83, %103) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %105 = call @ConvBackwardDataOp72(%104#0, %22) : (tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> + %106 = call @ConvBackwardFilterOp73(%75#0, %104#0) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> tensor<512x512x3x3xf16> + %107 = call @Unknown74(%75#1, %105) : (tensor<4x512x7x7xi1>, tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf16> + %108:3 = call @BatchNormGradOp71(%73, %arg78, %107) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %109 = call @ConvBackwardDataOp84(%108#0, %21) : (tensor<4x512x7x7xf16>, tensor<512x256x3x3xf16>) -> tensor<4x256x14x14xf16> + %110 = call @ConvBackwardFilterOp85(%70#0, %108#0) : (tensor<4x256x14x14xf16>, tensor<4x512x7x7xf16>) -> tensor<512x256x3x3xf16> + %111:3 = call @BatchNormGradOp71(%71, %arg88, %103) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %112 = call @ConvBackwardDataOp87(%111#0, %20) : (tensor<4x512x7x7xf16>, tensor<512x256x1x1xf16>) -> tensor<4x256x14x14xf16> + %113 = call @ConvBackwardFilterOp88(%70#0, %111#0) : (tensor<4x256x14x14xf16>, tensor<4x512x7x7xf16>) -> tensor<512x256x1x1xf16> + %114 = call @Unknown89(%112, %109, %70#1) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) -> tensor<4x256x14x14xf16> + %115:3 = call @BatchNormGradOp90(%68, %arg73, %114) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %116 = call @ConvBackwardDataOp91(%115#0, %19) : (tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> + %117 = call @ConvBackwardFilterOp92(%67#0, %115#0) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> tensor<256x256x3x3xf16> + %118 = call @Unknown93(%67#1, %116) : (tensor<4x256x14x14xi1>, tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf16> + %119:3 = call @BatchNormGradOp90(%65, %arg68, %118) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %120 = call @ConvBackwardDataOp91(%119#0, %18) : (tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> + %121 = call @ConvBackwardFilterOp92(%64#0, %119#0) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> tensor<256x256x3x3xf16> + %122 = call 
@Unknown89(%114, %120, %64#1) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) -> tensor<4x256x14x14xf16> + %123:3 = call @BatchNormGradOp90(%62, %arg58, %122) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %124 = call @ConvBackwardDataOp91(%123#0, %17) : (tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> + %125 = call @ConvBackwardFilterOp92(%61#0, %123#0) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> tensor<256x256x3x3xf16> + %126 = call @Unknown93(%61#1, %124) : (tensor<4x256x14x14xi1>, tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf16> + %127:3 = call @BatchNormGradOp90(%59, %arg53, %126) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %128 = call @ConvBackwardDataOp103(%127#0, %16) : (tensor<4x256x14x14xf16>, tensor<256x128x3x3xf16>) -> tensor<4x128x28x28xf16> + %129 = call @ConvBackwardFilterOp104(%56#0, %127#0) : (tensor<4x128x28x28xf16>, tensor<4x256x14x14xf16>) -> tensor<256x128x3x3xf16> + %130:3 = call @BatchNormGradOp90(%57, %arg63, %122) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %131 = call @ConvBackwardDataOp106(%130#0, %15) : (tensor<4x256x14x14xf16>, tensor<256x128x1x1xf16>) -> tensor<4x128x28x28xf16> + %132 = call @ConvBackwardFilterOp107(%56#0, %130#0) : (tensor<4x128x28x28xf16>, tensor<4x256x14x14xf16>) -> tensor<256x128x1x1xf16> + %133 = call @Unknown108(%131, %128, %56#1) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) -> tensor<4x128x28x28xf16> + %134:3 = call @BatchNormGradOp109(%54, %arg48, %133) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %135 = call @ConvBackwardDataOp110(%134#0, %14) : (tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> + %136 = call @ConvBackwardFilterOp111(%53#0, %134#0) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> tensor<128x128x3x3xf16> + %137 = call @Unknown112(%53#1, %135) : (tensor<4x128x28x28xi1>, tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf16> + %138:3 = call @BatchNormGradOp109(%51, %arg43, %137) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %139 = call @ConvBackwardDataOp110(%138#0, %13) : (tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> + %140 = call @ConvBackwardFilterOp111(%50#0, %138#0) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> tensor<128x128x3x3xf16> + %141 = call @Unknown108(%133, %139, %50#1) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) -> tensor<4x128x28x28xf16> + %142:3 = call @BatchNormGradOp109(%48, %arg33, %141) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %143 = call @ConvBackwardDataOp110(%142#0, %12) : (tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> + %144 = call @ConvBackwardFilterOp111(%47#0, %142#0) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> tensor<128x128x3x3xf16> + %145 = call @Unknown112(%47#1, %143) : (tensor<4x128x28x28xi1>, tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf16> + %146:3 = call @BatchNormGradOp109(%45, %arg28, %145) : 
(tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %147 = call @ConvBackwardDataOp122(%146#0, %11) : (tensor<4x128x28x28xf16>, tensor<128x64x3x3xf16>) -> tensor<4x64x56x56xf16> + %148 = call @ConvBackwardFilterOp123(%42#0, %146#0) : (tensor<4x64x56x56xf16>, tensor<4x128x28x28xf16>) -> tensor<128x64x3x3xf16> + %149:3 = call @BatchNormGradOp109(%43, %arg38, %141) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %150 = call @ConvBackwardDataOp125(%149#0, %10) : (tensor<4x128x28x28xf16>, tensor<128x64x1x1xf16>) -> tensor<4x64x56x56xf16> + %151 = call @ConvBackwardFilterOp126(%42#0, %149#0) : (tensor<4x64x56x56xf16>, tensor<4x128x28x28xf16>) -> tensor<128x64x1x1xf16> + %152 = call @Unknown127(%150, %147, %42#1) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) -> tensor<4x64x56x56xf16> + %153:3 = call @BatchNormGradOp128(%40, %arg23, %152) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) + %154 = call @ConvBackwardDataOp129(%153#0, %9) : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> + %155 = call @ConvBackwardFilterOp130(%39#0, %153#0) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> + %156 = call @Unknown131(%39#1, %154) : (tensor<4x64x56x56xi1>, tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> + %157:3 = call @BatchNormGradOp128(%37, %arg18, %156) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) + %158 = call @ConvBackwardDataOp129(%157#0, %8) : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> + %159 = call @ConvBackwardFilterOp130(%36#0, %157#0) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> + %160 = call @Unknown127(%152, %158, %36#1) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) -> tensor<4x64x56x56xf16> + %161:3 = call @BatchNormGradOp128(%34, %arg13, %160) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) + %162 = call @ConvBackwardDataOp129(%161#0, %7) : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> + %163 = call @ConvBackwardFilterOp130(%33#0, %161#0) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> + %164 = call @Unknown131(%33#1, %162) : (tensor<4x64x56x56xi1>, tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> + %165:3 = call @BatchNormGradOp128(%31, %arg8, %164) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) + %166 = call @ConvBackwardDataOp129(%165#0, %6) : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> + %167 = call @ConvBackwardFilterOp130(%30, %165#0) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> + %168 = call @Unknown143(%160, %166) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> + %169 = "mhlo.select_and_scatter"(%29#0, %168, %0) ({ ^bb0(%arg104: tensor, %arg105: tensor): - %198 = mhlo.compare GE, %arg104, %arg105 : (tensor, tensor) -> tensor - mhlo.return %198 : tensor + %199 = mhlo.compare GE, %arg104, %arg105 : (tensor, tensor) -> tensor + mhlo.return %199 : tensor }, { ^bb0(%arg104: tensor, %arg105: 
tensor): - %198 = mhlo.add %arg104, %arg105 : tensor - mhlo.return %198 : tensor + %199 = mhlo.add %arg104, %arg105 : tensor + mhlo.return %199 : tensor }) {padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : (tensor<4x64x112x112xf16>, tensor<4x64x56x56xf16>, tensor) -> tensor<4x64x112x112xf16> - %169 = call @Unknown138(%29#1, %168) : (tensor<4x64x112x112xi1>, tensor<4x64x112x112xf16>) -> tensor<4x64x112x112xf16> - %170:3 = call @BatchNormGradOp139(%5, %arg3, %169) : (tensor<4x64x112x112xf16>, tensor<64xf32>, tensor<4x64x112x112xf16>) -> (tensor<4x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) - %171 = call @ConvBackwardFilterOp140(%3, %170#0) : (tensor<4x3x224x224xf16>, tensor<4x64x112x112xf16>) -> tensor<64x3x7x7xf16> - %172 = mhlo.reduce(%92#1 init: %0) across dimensions = [0, 1] : (tensor<4x1000xf32>, tensor) -> tensor - reducer(%arg104: tensor, %arg105: tensor) { - %198 = mhlo.add %arg104, %arg105 : tensor - mhlo.return %198 : tensor - } - %173 = call @Unknown141(%172) : (tensor) -> tensor - %174 = call @Unknown142(%171) : (tensor<64x3x7x7xf16>) -> tensor<64x3x7x7xf32> - %175 = call @Unknown143(%166) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - %176 = call @Unknown144(%162) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - %177 = call @Unknown145(%158) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - %178 = call @Unknown146(%154) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - %179 = call @Unknown147(%147) : (tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> - %180 = call @Unknown148(%143) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> - %181 = call @Unknown149(%150) : (tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> - %182 = call @Unknown150(%139) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> - %183 = call @Unknown151(%135) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> - %184 = call @Unknown152(%128) : (tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> - %185 = call @Unknown153(%124) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> - %186 = call @Unknown154(%131) : (tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> - %187 = call @Unknown155(%120) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> - %188 = call @Unknown156(%116) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> - %189 = call @Unknown157(%109) : (tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> - %190 = call @Unknown158(%105) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> - %191 = call @Unknown159(%112) : (tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> - %192 = call @Unknown160(%101) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> - %193 = call @Unknown161(%97) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> - %194 = call @MatmulOp162(%86, %92#0) : (tensor<4x512xf16>, tensor<4x1000xf16>) -> tensor<1000x512xf16> - %195 = call @Unknown163(%194) : (tensor<1000x512xf16>) -> tensor<1000x512xf32> - %196 = mhlo.reduce(%92#2 init: %0) across dimensions = [0] : (tensor<4x1000xf32>, tensor) -> tensor<1000xf32> - reducer(%arg104: tensor, %arg105: tensor) { - %198 = mhlo.add %arg104, %arg105 : tensor - mhlo.return %198 : tensor - } - %197 = call @Unknown164(%196) : (tensor<1000xf32>) -> tensor<1000xf32> - return %173, %174, %170#1, %170#2, %175, %164#1, %164#2, %176, %160#1, %160#2, %177, %156#1, %156#2, %178, %152#1, %152#2, %179, %145#1, %145#2, %180, %141#1, %141#2, %181, %148#1, %148#2, %182, %137#1, 
%137#2, %183, %133#1, %133#2, %184, %126#1, %126#2, %185, %122#1, %122#2, %186, %129#1, %129#2, %187, %118#1, %118#2, %188, %114#1, %114#2, %189, %107#1, %107#2, %190, %103#1, %103#2, %191, %110#1, %110#2, %192, %99#1, %99#2, %193, %95#1, %95#2, %195, %197 : tensor, tensor<64x3x7x7xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128x64x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x64x1x1xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256x128x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x128x1x1xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512x256x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x256x1x1xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1000x512xf32>, tensor<1000xf32> + %170 = call @Unknown144(%29#1, %169) : (tensor<4x64x112x112xi1>, tensor<4x64x112x112xf16>) -> tensor<4x64x112x112xf16> + %171:3 = call @BatchNormGradOp145(%4, %arg3, %170) : (tensor<4x64x112x112xf16>, tensor<64xf32>, tensor<4x64x112x112xf16>) -> (tensor<4x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) + %172 = call @ConvBackwardFilterOp146(%2, %171#0) : (tensor<4x3x224x224xf16>, tensor<4x64x112x112xf16>) -> tensor<64x3x7x7xf16> + %173 = call @Unknown147(%93#0, %arg1) : (tensor<4x1000xf16>, tensor<4x1000xf32>) -> tensor + %174 = call @Unknown148(%173) : (tensor) -> tensor + %175 = call @Unknown149(%172) : (tensor<64x3x7x7xf16>) -> tensor<64x3x7x7xf32> + %176 = call @Unknown150(%167) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> + %177 = call @Unknown150(%163) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> + %178 = call @Unknown150(%159) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> + %179 = call @Unknown150(%155) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> + %180 = call @Unknown154(%148) : (tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> + %181 = call @Unknown155(%144) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> + %182 = call @Unknown156(%151) : (tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> + %183 = call @Unknown155(%140) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> + %184 = call @Unknown155(%136) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> + %185 = call @Unknown159(%129) : (tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> + %186 = call @Unknown160(%125) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> + %187 = call @Unknown161(%132) : (tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> + %188 = call @Unknown160(%121) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> + %189 = call @Unknown160(%117) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> + %190 = call @Unknown164(%110) : (tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> + %191 = call @Unknown165(%106) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> + %192 = call @Unknown166(%113) : 
(tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> + %193 = call @Unknown165(%102) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> + %194 = call @Unknown165(%98) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> + %195 = call @MatmulOp169(%86, %93#1) : (tensor<4x512xf16>, tensor<4x1000xf16>) -> tensor<1000x512xf16> + %196 = call @Unknown170(%195) : (tensor<1000x512xf16>) -> tensor<1000x512xf32> + %197 = call @Unknown171(%93#1) : (tensor<4x1000xf16>) -> tensor<1000xf32> + %198 = call @Unknown172(%197) : (tensor<1000xf32>) -> tensor<1000xf32> + return %174, %175, %171#1, %171#2, %176, %165#1, %165#2, %177, %161#1, %161#2, %178, %157#1, %157#2, %179, %153#1, %153#2, %180, %146#1, %146#2, %181, %142#1, %142#2, %182, %149#1, %149#2, %183, %138#1, %138#2, %184, %134#1, %134#2, %185, %127#1, %127#2, %186, %123#1, %123#2, %187, %130#1, %130#2, %188, %119#1, %119#2, %189, %115#1, %115#2, %190, %108#1, %108#2, %191, %104#1, %104#2, %192, %111#1, %111#2, %193, %100#1, %100#2, %194, %96#1, %96#2, %196, %198 : tensor, tensor<64x3x7x7xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128x64x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x64x1x1xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256x128x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x128x1x1xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512x256x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x256x1x1xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1000x512xf32>, tensor<1000xf32> } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/Whole/3_byre_tensor_opt.mlir b/compiler/test/E2E/ResNet18/Whole/3_byre_tensor_opt.mlir index d7013596c..527c5f7c3 100644 --- a/compiler/test/E2E/ResNet18/Whole/3_byre_tensor_opt.mlir +++ b/compiler/test/E2E/ResNet18/Whole/3_byre_tensor_opt.mlir @@ -2,30 +2,78 @@ // CHECK-LABEL: func.func @main -#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> -#map1 = affine_map<(d0, d1) -> (d0, d1)> -#map2 = affine_map<(d0, d1) -> (d1)> -#map3 = affine_map<(d0, d1) -> (d0)> -#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> -#map5 = affine_map<() -> ()> -#map6 = affine_map<(d0) -> (d0)> +#map = affine_map<() -> ()> +#map1 = affine_map<(d0) -> (d0 * 2 - (d0 floordiv 512) * 1024, 1000)> +#map2 = affine_map<(d0) -> (d0 * 2 - (d0 floordiv 512) * 1024 + 2, 1000)> +#map3 = affine_map<(d0, d1) -> (d0 - d1)> +#map4 = affine_map<(d0) -> (d0 * 2)> +#map5 = affine_map<(d0) -> (d0 * 2 + 1)> +#map6 = affine_map<(d0) -> (d0 mod 64, 49)> +#map7 = affine_map<(d0) -> (d0 mod 64 + 1, 49)> +#map8 = affine_map<(d0) -> (d0 mod 128, 125)> +#map9 = affine_map<(d0) -> (d0 mod 128 + 1, 125)> +#map10 = affine_map<(d0) -> (d0 * 32)> +#map11 = affine_map<(d0) -> (d0 * -32 + 1000, 32)> +#map12 = affine_map<(d0, d1) -> (d1 * -32 + 1000, 32, d0)> +#map13 = affine_map<(d0, 
d1) -> (d1 * -32 + 1000, 32, d0 + 1)> +#map14 = affine_map<(d0)[s0] -> (d0 * 32 + s0)> module @IrToMhlo.2452 { func.func private @Unknown0(%arg0: tensor<4x3x224x224xf32>) -> tensor<4x3x224x224xf16> attributes {__byteir_elementwise_fusion__} { + %c224 = arith.constant 224 : index + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<4x3x224x224xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<4x3x224x224xf32>) outs(%0 : tensor<4x3x224x224xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<4x3x224x224xf16> + %1 = scf.for %arg1 = %c0 to %c4 step %c1 iter_args(%arg2 = %0) -> (tensor<4x3x224x224xf16>) { + %2 = scf.for %arg3 = %c0 to %c3 step %c1 iter_args(%arg4 = %arg2) -> (tensor<4x3x224x224xf16>) { + %3 = scf.for %arg5 = %c0 to %c224 step %c1 iter_args(%arg6 = %arg4) -> (tensor<4x3x224x224xf16>) { + %4 = scf.for %arg7 = %c0 to %c224 step %c1 iter_args(%arg8 = %arg6) -> (tensor<4x3x224x224xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x3x224x224xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x3x224x224xf16> + scf.yield %inserted_slice : tensor<4x3x224x224xf16> + } + scf.yield %4 : tensor<4x3x224x224xf16> + } + scf.yield %3 : tensor<4x3x224x224xf16> + } + scf.yield %2 : tensor<4x3x224x224xf16> + } return %1 : tensor<4x3x224x224xf16> } func.func private @Unknown1(%arg0: tensor<64x3x7x7xf32>) -> tensor<64x3x7x7xf16> attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<64x3x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x3x7x7xf32>) outs(%0 : tensor<64x3x7x7xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<64x3x7x7xf16> + %1 = scf.for %arg1 = %c0 to %c64 step %c1 iter_args(%arg2 = %0) -> (tensor<64x3x7x7xf16>) { + %2 = scf.for %arg3 = %c0 to %c3 step %c1 iter_args(%arg4 = %arg2) -> (tensor<64x3x7x7xf16>) { + %3 = scf.for %arg5 = %c0 to %c7 step %c1 iter_args(%arg6 = %arg4) -> (tensor<64x3x7x7xf16>) { + %4 = scf.for %arg7 = %c0 to %c7 step %c1 iter_args(%arg8 = %arg6) -> (tensor<64x3x7x7xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<64x3x7x7xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<64x3x7x7xf16> + scf.yield %inserted_slice : 
tensor<64x3x7x7xf16> + } + scf.yield %4 : tensor<64x3x7x7xf16> + } + scf.yield %3 : tensor<64x3x7x7xf16> + } + scf.yield %2 : tensor<64x3x7x7xf16> + } return %1 : tensor<64x3x7x7xf16> } func.func private @BatchNormTrainingOp2(%arg0: tensor<4x64x112x112xf16>, %arg1: tensor<64xf32>, %arg2: tensor<64xf32>) -> tensor<4x64x112x112xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { @@ -35,611 +83,1420 @@ module @IrToMhlo.2452 { return %1 : tensor<4x64x112x112xf16> } func.func private @Unknown3(%arg0: tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<64x64x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf32>) outs(%0 : tensor<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<64x64x3x3xf16> - return %1 : tensor<64x64x3x3xf16> - } - func.func private @Unknown4(%arg0: tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<64x64x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf32>) outs(%0 : tensor<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<64x64x3x3xf16> - return %1 : tensor<64x64x3x3xf16> - } - func.func private @Unknown5(%arg0: tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<64x64x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf32>) outs(%0 : tensor<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<64x64x3x3xf16> - return %1 : tensor<64x64x3x3xf16> - } - func.func private @Unknown6(%arg0: tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<64x64x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf32>) outs(%0 : tensor<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<64x64x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c64 step %c1 iter_args(%arg2 = %0) -> (tensor<64x64x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %arg2) -> (tensor<64x64x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<64x64x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<64x64x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<64x64x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = 
tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<64x64x3x3xf16> + scf.yield %inserted_slice : tensor<64x64x3x3xf16> + } + scf.yield %4 : tensor<64x64x3x3xf16> + } + scf.yield %3 : tensor<64x64x3x3xf16> + } + scf.yield %2 : tensor<64x64x3x3xf16> + } return %1 : tensor<64x64x3x3xf16> } func.func private @Unknown7(%arg0: tensor<128x64x1x1xf32>) -> tensor<128x64x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<128x64x1x1xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x64x1x1xf32>) outs(%0 : tensor<128x64x1x1xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<128x64x1x1xf16> + %1 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<128x64x1x1xf16>) { + %2 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %arg2) -> (tensor<128x64x1x1xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<128x64x1x1xf32> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f32, %out: f16): + %5 = arith.truncf %in : f32 to f16 + linalg.yield %5 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<128x64x1x1xf16> + scf.yield %inserted_slice : tensor<128x64x1x1xf16> + } + scf.yield %2 : tensor<128x64x1x1xf16> + } return %1 : tensor<128x64x1x1xf16> } func.func private @Unknown8(%arg0: tensor<128x64x3x3xf32>) -> tensor<128x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<128x64x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x64x3x3xf32>) outs(%0 : tensor<128x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<128x64x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<128x64x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %arg2) -> (tensor<128x64x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<128x64x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<128x64x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<128x64x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<128x64x3x3xf16> + scf.yield %inserted_slice : tensor<128x64x3x3xf16> + } + scf.yield %4 : tensor<128x64x3x3xf16> + } + scf.yield %3 : 
tensor<128x64x3x3xf16> + } + scf.yield %2 : tensor<128x64x3x3xf16> + } return %1 : tensor<128x64x3x3xf16> } func.func private @Unknown9(%arg0: tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<128x128x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x128x3x3xf32>) outs(%0 : tensor<128x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<128x128x3x3xf16> - return %1 : tensor<128x128x3x3xf16> - } - func.func private @Unknown10(%arg0: tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<128x128x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x128x3x3xf32>) outs(%0 : tensor<128x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<128x128x3x3xf16> - return %1 : tensor<128x128x3x3xf16> - } - func.func private @Unknown11(%arg0: tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<128x128x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x128x3x3xf32>) outs(%0 : tensor<128x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<128x128x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<128x128x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %arg2) -> (tensor<128x128x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<128x128x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<128x128x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<128x128x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<128x128x3x3xf16> + scf.yield %inserted_slice : tensor<128x128x3x3xf16> + } + scf.yield %4 : tensor<128x128x3x3xf16> + } + scf.yield %3 : tensor<128x128x3x3xf16> + } + scf.yield %2 : tensor<128x128x3x3xf16> + } return %1 : tensor<128x128x3x3xf16> } func.func private @Unknown12(%arg0: tensor<256x128x1x1xf32>) -> tensor<256x128x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<256x128x1x1xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x128x1x1xf32>) outs(%0 : tensor<256x128x1x1xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in 
: f32 to f16 - linalg.yield %2 : f16 - } -> tensor<256x128x1x1xf16> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 iter_args(%arg2 = %0) -> (tensor<256x128x1x1xf16>) { + %2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %arg2) -> (tensor<256x128x1x1xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<256x128x1x1xf32> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f32, %out: f16): + %5 = arith.truncf %in : f32 to f16 + linalg.yield %5 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<256x128x1x1xf16> + scf.yield %inserted_slice : tensor<256x128x1x1xf16> + } + scf.yield %2 : tensor<256x128x1x1xf16> + } return %1 : tensor<256x128x1x1xf16> } func.func private @Unknown13(%arg0: tensor<256x128x3x3xf32>) -> tensor<256x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<256x128x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x128x3x3xf32>) outs(%0 : tensor<256x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<256x128x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 iter_args(%arg2 = %0) -> (tensor<256x128x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %arg2) -> (tensor<256x128x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<256x128x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<256x128x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<256x128x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<256x128x3x3xf16> + scf.yield %inserted_slice : tensor<256x128x3x3xf16> + } + scf.yield %4 : tensor<256x128x3x3xf16> + } + scf.yield %3 : tensor<256x128x3x3xf16> + } + scf.yield %2 : tensor<256x128x3x3xf16> + } return %1 : tensor<256x128x3x3xf16> } func.func private @Unknown14(%arg0: tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<256x256x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x256x3x3xf32>) outs(%0 : tensor<256x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<256x256x3x3xf16> - return %1 : tensor<256x256x3x3xf16> - } - func.func private @Unknown15(%arg0: tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> attributes 
{__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<256x256x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x256x3x3xf32>) outs(%0 : tensor<256x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<256x256x3x3xf16> - return %1 : tensor<256x256x3x3xf16> - } - func.func private @Unknown16(%arg0: tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<256x256x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x256x3x3xf32>) outs(%0 : tensor<256x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<256x256x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 iter_args(%arg2 = %0) -> (tensor<256x256x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %arg2) -> (tensor<256x256x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<256x256x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<256x256x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<256x256x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<256x256x3x3xf16> + scf.yield %inserted_slice : tensor<256x256x3x3xf16> + } + scf.yield %4 : tensor<256x256x3x3xf16> + } + scf.yield %3 : tensor<256x256x3x3xf16> + } + scf.yield %2 : tensor<256x256x3x3xf16> + } return %1 : tensor<256x256x3x3xf16> } func.func private @Unknown17(%arg0: tensor<512x256x1x1xf32>) -> tensor<512x256x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<512x256x1x1xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x256x1x1xf32>) outs(%0 : tensor<512x256x1x1xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<512x256x1x1xf16> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<512x256x1x1xf16>) { + %2 = scf.for %arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %arg2) -> (tensor<512x256x1x1xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<512x256x1x1xf32> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f32, %out: f16): + %5 = arith.truncf %in : f32 to f16 + linalg.yield %5 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<512x256x1x1xf16> + scf.yield %inserted_slice : tensor<512x256x1x1xf16> + } + scf.yield %2 : 
tensor<512x256x1x1xf16> + } return %1 : tensor<512x256x1x1xf16> } func.func private @Unknown18(%arg0: tensor<512x256x3x3xf32>) -> tensor<512x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<512x256x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x256x3x3xf32>) outs(%0 : tensor<512x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<512x256x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<512x256x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %arg2) -> (tensor<512x256x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<512x256x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<512x256x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<512x256x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<512x256x3x3xf16> + scf.yield %inserted_slice : tensor<512x256x3x3xf16> + } + scf.yield %4 : tensor<512x256x3x3xf16> + } + scf.yield %3 : tensor<512x256x3x3xf16> + } + scf.yield %2 : tensor<512x256x3x3xf16> + } return %1 : tensor<512x256x3x3xf16> } func.func private @Unknown19(%arg0: tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<512x512x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x512x3x3xf32>) outs(%0 : tensor<512x512x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<512x512x3x3xf16> - return %1 : tensor<512x512x3x3xf16> - } - func.func private @Unknown20(%arg0: tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<512x512x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x512x3x3xf32>) outs(%0 : tensor<512x512x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<512x512x3x3xf16> - return %1 : tensor<512x512x3x3xf16> - } - func.func private @Unknown21(%arg0: tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<512x512x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x512x3x3xf32>) outs(%0 : tensor<512x512x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 
to f16 - linalg.yield %2 : f16 - } -> tensor<512x512x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<512x512x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %arg2) -> (tensor<512x512x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<512x512x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<512x512x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<512x512x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<512x512x3x3xf16> + scf.yield %inserted_slice : tensor<512x512x3x3xf16> + } + scf.yield %4 : tensor<512x512x3x3xf16> + } + scf.yield %3 : tensor<512x512x3x3xf16> + } + scf.yield %2 : tensor<512x512x3x3xf16> + } return %1 : tensor<512x512x3x3xf16> } func.func private @Unknown22(%arg0: tensor<4x1000xf32>) -> tensor<4x1000xf16> attributes {__byteir_elementwise_fusion__} { + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant -2.500000e-01 : f32 %0 = tensor.empty() : tensor<4x1000xf16> - %1 = linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<4x1000xf32>) outs(%0 : tensor<4x1000xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.mulf %in, %cst : f32 - %3 = arith.truncf %2 : f32 to f16 - linalg.yield %3 : f16 - } -> tensor<4x1000xf16> + %1 = scf.for %arg1 = %c0 to %c4 step %c1 iter_args(%arg2 = %0) -> (tensor<4x1000xf16>) { + %2 = scf.for %arg3 = %c0 to %c1000 step %c1 iter_args(%arg4 = %arg2) -> (tensor<4x1000xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3] [1, 1] [1, 1] : tensor<4x1000xf32> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f32, %out: f16): + %5 = arith.mulf %in, %cst : f32 + %6 = arith.truncf %5 : f32 to f16 + linalg.yield %6 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3] [1, 1] [1, 1] : tensor into tensor<4x1000xf16> + scf.yield %inserted_slice : tensor<4x1000xf16> + } + scf.yield %2 : tensor<4x1000xf16> + } return %1 : tensor<4x1000xf16> } func.func private @Unknown23(%arg0: tensor<1000x512xf32>) -> tensor<1000x512xf16> attributes {__byteir_elementwise_fusion__} { + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<1000x512xf16> - %1 = linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1000x512xf32>) outs(%0 : tensor<1000x512xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<1000x512xf16> + %1 = scf.for %arg1 = %c0 to %c1000 step %c1 iter_args(%arg2 = %0) -> (tensor<1000x512xf16>) { + %2 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %arg2) -> (tensor<1000x512xf16>) { + %extracted_slice = tensor.extract_slice 
%arg0[%arg1, %arg3] [1, 1] [1, 1] : tensor<1000x512xf32> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f32, %out: f16): + %5 = arith.truncf %in : f32 to f16 + linalg.yield %5 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3] [1, 1] [1, 1] : tensor into tensor<1000x512xf16> + scf.yield %inserted_slice : tensor<1000x512xf16> + } + scf.yield %2 : tensor<1000x512xf16> + } return %1 : tensor<1000x512xf16> } - func.func private @Unknown24(%arg0: tensor<4x64x112x112xf16>) -> (tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown24(%arg0: tensor<1000xf32>) -> tensor<1000xf16> attributes {__byteir_elementwise_fusion__} { + %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index + %c0 = arith.constant 0 : index + %0 = tensor.empty() : tensor<1000xf16> + %1 = scf.for %arg1 = %c0 to %c1000 step %c1 iter_args(%arg2 = %0) -> (tensor<1000xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1] [1] [1] : tensor<1000xf32> to tensor + %2 = tensor.empty() : tensor + %3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%2 : tensor) { + ^bb0(%in: f32, %out: f16): + %4 = arith.truncf %in : f32 to f16 + linalg.yield %4 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %3 into %arg2[%arg1] [1] [1] : tensor into tensor<1000xf16> + scf.yield %inserted_slice : tensor<1000xf16> + } + return %1 : tensor<1000xf16> + } + func.func private @Unknown25(%arg0: tensor<4x1000xf16>) -> tensor<4xf16> attributes {__byteir_reduction_fusion__} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 0.000000e+00 : f16 + %0 = tensor.empty() : tensor<4xf16> + %1 = scf.forall (%arg1) in (4) shared_outs(%arg2 = %0) -> (tensor<4xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, 0] [1, 1000] [1, 1] : tensor<4x1000xf16> to tensor<1000xf16> + %expanded = tensor.expand_shape %extracted_slice [[0, 1]] : tensor<1000xf16> into tensor<1x1000xf16> + %extracted_slice_0 = tensor.extract_slice %arg2[%arg1] [1] [1] : tensor<4xf16> to tensor + %2 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<512xf16> + %3 = scf.forall (%arg3) in (512) shared_outs(%arg4 = %2) -> (tensor<512xf16>) { + %21 = affine.min #map1(%arg3) + %22 = affine.min #map2(%arg3) + %23 = affine.apply #map3(%22, %21) + %extracted_slice_9 = tensor.extract_slice %expanded[0, %21] [1, %23] [1, 1] : tensor<1x1000xf16> to tensor + %expanded_10 = tensor.expand_shape %extracted_slice_9 [[0, 1]] : tensor into tensor<1x?xf16> + %dim = tensor.dim %expanded_10, %c1 : tensor<1x?xf16> + %24 = arith.cmpi ugt, %dim, %c0 : index + %25 = scf.if %24 -> (f16) { + %extracted = tensor.extract %expanded_10[%c0, %c0] : tensor<1x?xf16> + scf.yield %extracted : f16 + } else { + scf.yield %cst : f16 + } + %26 = arith.addf %25, %cst : f16 + %dim_11 = tensor.dim %expanded_10, %c1 : tensor<1x?xf16> + %27 = arith.cmpi ugt, %dim_11, %c1 : index + %28 = scf.if %27 -> (f16) { + %extracted = tensor.extract %expanded_10[%c0, %c1] : tensor<1x?xf16> + scf.yield %extracted : f16 + } else { + scf.yield %cst : f16 + } + %29 = arith.addf %26, %28 : f16 + %extracted_slice_12 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<512xf16> to tensor + %inserted = tensor.insert %29 into %extracted_slice_12[] : tensor + 
scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<512xf16> + } + } {mapping = [#gpu.thread]} + %expanded_1 = tensor.expand_shape %3 [[0, 1]] : tensor<512xf16> into tensor<256x2xf16> + %4 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<256xf16> + %5 = scf.forall (%arg3) in (256) shared_outs(%arg4 = %4) -> (tensor<256xf16>) { + %extracted = tensor.extract %expanded_1[%arg3, %c0] : tensor<256x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_1[%arg3, %c1] : tensor<256x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<256xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<256xf16> + } + } {mapping = [#gpu.thread]} + %expanded_2 = tensor.expand_shape %5 [[0, 1]] : tensor<256xf16> into tensor<128x2xf16> + %6 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<128xf16> + %7 = scf.forall (%arg3) in (128) shared_outs(%arg4 = %6) -> (tensor<128xf16>) { + %extracted = tensor.extract %expanded_2[%arg3, %c0] : tensor<128x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_2[%arg3, %c1] : tensor<128x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<128xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<128xf16> + } + } {mapping = [#gpu.thread]} + %expanded_3 = tensor.expand_shape %7 [[0, 1]] : tensor<128xf16> into tensor<64x2xf16> + %8 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<64xf16> + %9 = scf.forall (%arg3) in (64) shared_outs(%arg4 = %8) -> (tensor<64xf16>) { + %extracted = tensor.extract %expanded_3[%arg3, %c0] : tensor<64x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_3[%arg3, %c1] : tensor<64x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<64xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<64xf16> + } + } {mapping = [#gpu.thread]} + %expanded_4 = tensor.expand_shape %9 [[0, 1]] : tensor<64xf16> into tensor<32x2xf16> + %10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<32xf16> + %11 = scf.forall (%arg3) in (32) shared_outs(%arg4 = %10) -> (tensor<32xf16>) { + %extracted = tensor.extract %expanded_4[%arg3, %c0] : tensor<32x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_4[%arg3, %c1] : tensor<32x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<32xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<32xf16> + } + } {mapping = [#gpu.thread]} + %expanded_5 = tensor.expand_shape %11 [[0, 1]] : tensor<32xf16> into tensor<16x2xf16> + %12 = bufferization.alloc_tensor() {memory_space = 
#gpu.address_space} : tensor<16xf16> + %13 = scf.forall (%arg3) in (16) shared_outs(%arg4 = %12) -> (tensor<16xf16>) { + %extracted = tensor.extract %expanded_5[%arg3, %c0] : tensor<16x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_5[%arg3, %c1] : tensor<16x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<16xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<16xf16> + } + } {mapping = [#gpu.thread]} + %expanded_6 = tensor.expand_shape %13 [[0, 1]] : tensor<16xf16> into tensor<8x2xf16> + %14 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<8xf16> + %15 = scf.forall (%arg3) in (8) shared_outs(%arg4 = %14) -> (tensor<8xf16>) { + %extracted = tensor.extract %expanded_6[%arg3, %c0] : tensor<8x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_6[%arg3, %c1] : tensor<8x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<8xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<8xf16> + } + } {mapping = [#gpu.thread]} + %expanded_7 = tensor.expand_shape %15 [[0, 1]] : tensor<8xf16> into tensor<4x2xf16> + %16 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<4xf16> + %17 = scf.forall (%arg3) in (4) shared_outs(%arg4 = %16) -> (tensor<4xf16>) { + %extracted = tensor.extract %expanded_7[%arg3, %c0] : tensor<4x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_7[%arg3, %c1] : tensor<4x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<4xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<4xf16> + } + } {mapping = [#gpu.thread]} + %expanded_8 = tensor.expand_shape %17 [[0, 1]] : tensor<4xf16> into tensor<2x2xf16> + %18 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<2xf16> + %19 = scf.forall (%arg3) in (2) shared_outs(%arg4 = %18) -> (tensor<2xf16>) { + %extracted = tensor.extract %expanded_8[%arg3, %c0] : tensor<2x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_8[%arg3, %c1] : tensor<2x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<2xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<2xf16> + } + } {mapping = [#gpu.thread]} + %20 = scf.forall (%arg3) in (1) shared_outs(%arg4 = %extracted_slice_0) -> (tensor) { + %21 = affine.apply #map4(%arg3) + %extracted = tensor.extract %19[%21] : tensor<2xf16> + %22 = arith.addf %extracted, %cst : f16 + %23 = affine.apply #map5(%arg3) + %extracted_9 = tensor.extract %19[%23] : tensor<2xf16> + %24 = arith.addf %extracted_9, %22 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[] [] [] : tensor to tensor + %inserted = tensor.insert %24 into 
%extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[] [] [] : tensor into tensor + } + } {mapping = [#gpu.thread]} + scf.forall.in_parallel { + tensor.parallel_insert_slice %20 into %arg2[%arg1] [1] [1] : tensor into tensor<4xf16> + } + } {mapping = [#gpu.block]} + return %1 : tensor<4xf16> + } + func.func private @Unknown26(%arg0: tensor<4x64x112x112xf16>) -> (tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1>) attributes {__byteir_elementwise_fusion__} { + %c112 = arith.constant 112 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x64x112x112xf16> %1 = tensor.empty() : tensor<4x64x112x112xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<4x64x112x112xf16>) outs(%0, %1 : tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1>) { - ^bb0(%in: f16, %out: f16, %out_0: i1): - %3 = arith.maxnumf %in, %cst : f16 - %4 = arith.cmpf ogt, %3, %cst : f16 - linalg.yield %3, %4 : f16, i1 - } -> (tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1>) + %2:2 = scf.for %arg1 = %c0 to %c4 step %c1 iter_args(%arg2 = %0, %arg3 = %1) -> (tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1>) { + %3:2 = scf.for %arg4 = %c0 to %c64 step %c1 iter_args(%arg5 = %arg2, %arg6 = %arg3) -> (tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1>) { + %4:2 = scf.for %arg7 = %c0 to %c112 step %c1 iter_args(%arg8 = %arg5, %arg9 = %arg6) -> (tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1>) { + %5:2 = scf.for %arg10 = %c0 to %c112 step %c1 iter_args(%arg11 = %arg8, %arg12 = %arg9) -> (tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x112x112xf16> to tensor + %6 = tensor.empty() : tensor + %7 = tensor.empty() : tensor + %8:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%6, %7 : tensor, tensor) { + ^bb0(%in: f16, %out: f16, %out_1: i1): + %9 = arith.maximumf %in, %cst : f16 + %10 = arith.cmpf ogt, %9, %cst : f16 + linalg.yield %9, %10 : f16, i1 + } -> (tensor, tensor) + %inserted_slice = tensor.insert_slice %8#0 into %arg11[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x64x112x112xf16> + %inserted_slice_0 = tensor.insert_slice %8#1 into %arg12[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x64x112x112xi1> + scf.yield %inserted_slice, %inserted_slice_0 : tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1> + } + scf.yield %5#0, %5#1 : tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1> + } + scf.yield %4#0, %4#1 : tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1> + } + scf.yield %3#0, %3#1 : tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1> + } return %2#0, %2#1 : tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1> } - func.func private @BatchNormTrainingOp25(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<64xf32>) -> tensor<4x64x56x56xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 
1 : i64} : (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) -> (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) - %1 = mhlo.convert %output : (tensor<4x64x56x56xf32>) -> tensor<4x64x56x56xf16> - return %1 : tensor<4x64x56x56xf16> - } - func.func private @Unknown26(%arg0: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x64x56x56xf16> - %1 = tensor.empty() : tensor<4x64x56x56xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<4x64x56x56xf16>) outs(%0, %1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) { - ^bb0(%in: f16, %out: f16, %out_0: i1): - %3 = arith.maxnumf %in, %cst : f16 - %4 = arith.cmpf ogt, %3, %cst : f16 - linalg.yield %3, %4 : f16, i1 - } -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) - return %2#0, %2#1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> - } func.func private @BatchNormTrainingOp27(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<64xf32>) -> tensor<4x64x56x56xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { %0 = mhlo.convert %arg0 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) -> (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) %1 = mhlo.convert %output : (tensor<4x64x56x56xf32>) -> tensor<4x64x56x56xf16> return %1 : tensor<4x64x56x56xf16> } - func.func private @Unknown28(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x64x56x56xf16> - %1 = tensor.empty() : tensor<4x64x56x56xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) outs(%0, %1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) { - ^bb0(%in: f16, %in_0: f16, %out: f16, %out_1: i1): - %3 = arith.addf %in, %in_0 : f16 - %4 = arith.maxnumf %3, %cst : f16 - %5 = arith.cmpf ogt, %4, %cst : f16 - linalg.yield %4, %5 : f16, i1 - } -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) - return %2#0, %2#1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> - } - func.func private @BatchNormTrainingOp29(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<64xf32>) -> tensor<4x64x56x56xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) -> (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) - %1 = mhlo.convert %output : (tensor<4x64x56x56xf32>) -> tensor<4x64x56x56xf16> - return %1 : tensor<4x64x56x56xf16> - } - func.func private @Unknown30(%arg0: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) attributes 
{__byteir_elementwise_fusion__} { + func.func private @Unknown28(%arg0: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { + %c56 = arith.constant 56 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x64x56x56xf16> %1 = tensor.empty() : tensor<4x64x56x56xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<4x64x56x56xf16>) outs(%0, %1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) { - ^bb0(%in: f16, %out: f16, %out_0: i1): - %3 = arith.maxnumf %in, %cst : f16 - %4 = arith.cmpf ogt, %3, %cst : f16 - linalg.yield %3, %4 : f16, i1 - } -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) + %2:2 = scf.for %arg1 = %c0 to %c4 step %c1 iter_args(%arg2 = %0, %arg3 = %1) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) { + %3:2 = scf.for %arg4 = %c0 to %c64 step %c1 iter_args(%arg5 = %arg2, %arg6 = %arg3) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) { + %4:2 = scf.for %arg7 = %c0 to %c56 step %c1 iter_args(%arg8 = %arg5, %arg9 = %arg6) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) { + %5:2 = scf.for %arg10 = %c0 to %c56 step %c1 iter_args(%arg11 = %arg8, %arg12 = %arg9) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x56x56xf16> to tensor + %6 = tensor.empty() : tensor + %7 = tensor.empty() : tensor + %8:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%6, %7 : tensor, tensor) { + ^bb0(%in: f16, %out: f16, %out_1: i1): + %9 = arith.maximumf %in, %cst : f16 + %10 = arith.cmpf ogt, %9, %cst : f16 + linalg.yield %9, %10 : f16, i1 + } -> (tensor, tensor) + %inserted_slice = tensor.insert_slice %8#0 into %arg11[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x64x56x56xf16> + %inserted_slice_0 = tensor.insert_slice %8#1 into %arg12[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x64x56x56xi1> + scf.yield %inserted_slice, %inserted_slice_0 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> + } + scf.yield %5#0, %5#1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> + } + scf.yield %4#0, %4#1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> + } + scf.yield %3#0, %3#1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> + } return %2#0, %2#1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> } - func.func private @BatchNormTrainingOp31(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<64xf32>) -> tensor<4x64x56x56xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) -> (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) - %1 = mhlo.convert %output : (tensor<4x64x56x56xf32>) -> tensor<4x64x56x56xf16> - return %1 : tensor<4x64x56x56xf16> - } - func.func private @Unknown32(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, 
tensor<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown30(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { + %c56 = arith.constant 56 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x64x56x56xf16> %1 = tensor.empty() : tensor<4x64x56x56xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) outs(%0, %1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) { - ^bb0(%in: f16, %in_0: f16, %out: f16, %out_1: i1): - %3 = arith.addf %in, %in_0 : f16 - %4 = arith.maxnumf %3, %cst : f16 - %5 = arith.cmpf ogt, %4, %cst : f16 - linalg.yield %4, %5 : f16, i1 - } -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) + %2:2 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %0, %arg4 = %1) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) { + %3:2 = scf.for %arg5 = %c0 to %c64 step %c1 iter_args(%arg6 = %arg3, %arg7 = %arg4) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) { + %4:2 = scf.for %arg8 = %c0 to %c56 step %c1 iter_args(%arg9 = %arg6, %arg10 = %arg7) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) { + %5:2 = scf.for %arg11 = %c0 to %c56 step %c1 iter_args(%arg12 = %arg9, %arg13 = %arg10) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) { + %extracted_slice = tensor.extract_slice %arg0[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x56x56xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x56x56xf16> to tensor + %6 = tensor.empty() : tensor + %7 = tensor.empty() : tensor + %8:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%6, %7 : tensor, tensor) { + ^bb0(%in: f16, %in_2: f16, %out: f16, %out_3: i1): + %9 = arith.addf %in, %in_2 : f16 + %10 = arith.maximumf %9, %cst : f16 + %11 = arith.cmpf ogt, %10, %cst : f16 + linalg.yield %10, %11 : f16, i1 + } -> (tensor, tensor) + %inserted_slice = tensor.insert_slice %8#0 into %arg12[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x64x56x56xf16> + %inserted_slice_1 = tensor.insert_slice %8#1 into %arg13[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x64x56x56xi1> + scf.yield %inserted_slice, %inserted_slice_1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> + } + scf.yield %5#0, %5#1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> + } + scf.yield %4#0, %4#1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> + } + scf.yield %3#0, %3#1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> + } return %2#0, %2#1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> } - func.func private @BatchNormTrainingOp33(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> tensor<4x128x28x28xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { + func.func private @BatchNormTrainingOp35(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> tensor<4x128x28x28xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, 
__byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { %0 = mhlo.convert %arg0 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) -> (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) %1 = mhlo.convert %output : (tensor<4x128x28x28xf32>) -> tensor<4x128x28x28xf16> return %1 : tensor<4x128x28x28xf16> } - func.func private @BatchNormTrainingOp34(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> tensor<4x128x28x28xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) -> (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %1 = mhlo.convert %output : (tensor<4x128x28x28xf32>) -> tensor<4x128x28x28xf16> - return %1 : tensor<4x128x28x28xf16> - } - func.func private @Unknown35(%arg0: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown37(%arg0: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { + %c28 = arith.constant 28 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x128x28x28xf16> %1 = tensor.empty() : tensor<4x128x28x28xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<4x128x28x28xf16>) outs(%0, %1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) { - ^bb0(%in: f16, %out: f16, %out_0: i1): - %3 = arith.maxnumf %in, %cst : f16 - %4 = arith.cmpf ogt, %3, %cst : f16 - linalg.yield %3, %4 : f16, i1 - } -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) - return %2#0, %2#1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> - } - func.func private @BatchNormTrainingOp36(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> tensor<4x128x28x28xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) -> (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %1 = mhlo.convert %output : (tensor<4x128x28x28xf32>) -> tensor<4x128x28x28xf16> - return %1 : tensor<4x128x28x28xf16> - } - func.func private @Unknown37(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x128x28x28xf16> - %1 = tensor.empty() : tensor<4x128x28x28xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, 
#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) outs(%0, %1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) { - ^bb0(%in: f16, %in_0: f16, %out: f16, %out_1: i1): - %3 = arith.addf %in, %in_0 : f16 - %4 = arith.maxnumf %3, %cst : f16 - %5 = arith.cmpf ogt, %4, %cst : f16 - linalg.yield %4, %5 : f16, i1 - } -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) - return %2#0, %2#1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> - } - func.func private @BatchNormTrainingOp38(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> tensor<4x128x28x28xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) -> (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %1 = mhlo.convert %output : (tensor<4x128x28x28xf32>) -> tensor<4x128x28x28xf16> - return %1 : tensor<4x128x28x28xf16> - } - func.func private @Unknown39(%arg0: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x128x28x28xf16> - %1 = tensor.empty() : tensor<4x128x28x28xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<4x128x28x28xf16>) outs(%0, %1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) { - ^bb0(%in: f16, %out: f16, %out_0: i1): - %3 = arith.maxnumf %in, %cst : f16 - %4 = arith.cmpf ogt, %3, %cst : f16 - linalg.yield %3, %4 : f16, i1 - } -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) + %2:2 = scf.for %arg1 = %c0 to %c4 step %c1 iter_args(%arg2 = %0, %arg3 = %1) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) { + %3:2 = scf.for %arg4 = %c0 to %c128 step %c1 iter_args(%arg5 = %arg2, %arg6 = %arg3) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) { + %4:2 = scf.for %arg7 = %c0 to %c28 step %c1 iter_args(%arg8 = %arg5, %arg9 = %arg6) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) { + %5:2 = scf.for %arg10 = %c0 to %c28 step %c1 iter_args(%arg11 = %arg8, %arg12 = %arg9) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x128x28x28xf16> to tensor + %6 = tensor.empty() : tensor + %7 = tensor.empty() : tensor + %8:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%6, %7 : tensor, tensor) { + ^bb0(%in: f16, %out: f16, %out_1: i1): + %9 = arith.maximumf %in, %cst : f16 + %10 = arith.cmpf ogt, %9, %cst : f16 + linalg.yield %9, %10 : f16, i1 + } -> (tensor, tensor) + %inserted_slice = tensor.insert_slice %8#0 into %arg11[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x128x28x28xf16> + %inserted_slice_0 = tensor.insert_slice %8#1 into %arg12[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x128x28x28xi1> + scf.yield %inserted_slice, %inserted_slice_0 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> + } + scf.yield %5#0, %5#1 : 
tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> + } + scf.yield %4#0, %4#1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> + } + scf.yield %3#0, %3#1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> + } return %2#0, %2#1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> } - func.func private @BatchNormTrainingOp40(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> tensor<4x128x28x28xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) -> (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %1 = mhlo.convert %output : (tensor<4x128x28x28xf32>) -> tensor<4x128x28x28xf16> - return %1 : tensor<4x128x28x28xf16> - } - func.func private @Unknown41(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown39(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { + %c28 = arith.constant 28 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x128x28x28xf16> %1 = tensor.empty() : tensor<4x128x28x28xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) outs(%0, %1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) { - ^bb0(%in: f16, %in_0: f16, %out: f16, %out_1: i1): - %3 = arith.addf %in, %in_0 : f16 - %4 = arith.maxnumf %3, %cst : f16 - %5 = arith.cmpf ogt, %4, %cst : f16 - linalg.yield %4, %5 : f16, i1 - } -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) + %2:2 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %0, %arg4 = %1) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) { + %3:2 = scf.for %arg5 = %c0 to %c128 step %c1 iter_args(%arg6 = %arg3, %arg7 = %arg4) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) { + %4:2 = scf.for %arg8 = %c0 to %c28 step %c1 iter_args(%arg9 = %arg6, %arg10 = %arg7) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) { + %5:2 = scf.for %arg11 = %c0 to %c28 step %c1 iter_args(%arg12 = %arg9, %arg13 = %arg10) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) { + %extracted_slice = tensor.extract_slice %arg0[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x128x28x28xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x128x28x28xf16> to tensor + %6 = tensor.empty() : tensor + %7 = tensor.empty() : tensor + %8:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%6, %7 : tensor, tensor) { + ^bb0(%in: f16, %in_2: f16, %out: f16, %out_3: i1): + %9 = arith.addf %in, %in_2 : f16 + %10 = arith.maximumf %9, %cst : f16 + %11 = arith.cmpf ogt, %10, %cst : f16 + linalg.yield %10, %11 : f16, i1 + } -> (tensor, tensor) + 
%inserted_slice = tensor.insert_slice %8#0 into %arg12[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x128x28x28xf16> + %inserted_slice_1 = tensor.insert_slice %8#1 into %arg13[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x128x28x28xi1> + scf.yield %inserted_slice, %inserted_slice_1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> + } + scf.yield %5#0, %5#1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> + } + scf.yield %4#0, %4#1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> + } + scf.yield %3#0, %3#1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> + } return %2#0, %2#1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> } - func.func private @BatchNormTrainingOp42(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<256xf32>) -> tensor<4x256x14x14xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) -> (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %1 = mhlo.convert %output : (tensor<4x256x14x14xf32>) -> tensor<4x256x14x14xf16> - return %1 : tensor<4x256x14x14xf16> - } - func.func private @BatchNormTrainingOp43(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<256xf32>) -> tensor<4x256x14x14xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) -> (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %1 = mhlo.convert %output : (tensor<4x256x14x14xf32>) -> tensor<4x256x14x14xf16> - return %1 : tensor<4x256x14x14xf16> - } - func.func private @Unknown44(%arg0: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x256x14x14xf16> - %1 = tensor.empty() : tensor<4x256x14x14xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<4x256x14x14xf16>) outs(%0, %1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) { - ^bb0(%in: f16, %out: f16, %out_0: i1): - %3 = arith.maxnumf %in, %cst : f16 - %4 = arith.cmpf ogt, %3, %cst : f16 - linalg.yield %3, %4 : f16, i1 - } -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) - return %2#0, %2#1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> - } - func.func private @BatchNormTrainingOp45(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<256xf32>) -> tensor<4x256x14x14xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x256x14x14xf32>, tensor<256xf32>, 
tensor<256xf32>) -> (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %1 = mhlo.convert %output : (tensor<4x256x14x14xf32>) -> tensor<4x256x14x14xf16> - return %1 : tensor<4x256x14x14xf16> - } - func.func private @Unknown46(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x256x14x14xf16> - %1 = tensor.empty() : tensor<4x256x14x14xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) outs(%0, %1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) { - ^bb0(%in: f16, %in_0: f16, %out: f16, %out_1: i1): - %3 = arith.addf %in, %in_0 : f16 - %4 = arith.maxnumf %3, %cst : f16 - %5 = arith.cmpf ogt, %4, %cst : f16 - linalg.yield %4, %5 : f16, i1 - } -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) - return %2#0, %2#1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> - } - func.func private @BatchNormTrainingOp47(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<256xf32>) -> tensor<4x256x14x14xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { + func.func private @BatchNormTrainingOp44(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<256xf32>) -> tensor<4x256x14x14xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { %0 = mhlo.convert %arg0 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) -> (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) %1 = mhlo.convert %output : (tensor<4x256x14x14xf32>) -> tensor<4x256x14x14xf16> return %1 : tensor<4x256x14x14xf16> } - func.func private @Unknown48(%arg0: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown46(%arg0: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { + %c14 = arith.constant 14 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x256x14x14xf16> %1 = tensor.empty() : tensor<4x256x14x14xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<4x256x14x14xf16>) outs(%0, %1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) { - ^bb0(%in: f16, %out: f16, %out_0: i1): - %3 = arith.maxnumf %in, %cst : f16 - %4 = arith.cmpf ogt, %3, %cst : f16 - linalg.yield %3, %4 : f16, i1 - } -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) + %2:2 = scf.for %arg1 = %c0 to %c4 step %c1 iter_args(%arg2 = %0, %arg3 = %1) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) { + %3:2 = scf.for %arg4 = %c0 to %c256 step %c1 iter_args(%arg5 = %arg2, %arg6 = %arg3) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) { + %4:2 = scf.for %arg7 = %c0 to %c14 step %c1 iter_args(%arg8 = %arg5, 
%arg9 = %arg6) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) { + %5:2 = scf.for %arg10 = %c0 to %c14 step %c1 iter_args(%arg11 = %arg8, %arg12 = %arg9) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x256x14x14xf16> to tensor + %6 = tensor.empty() : tensor + %7 = tensor.empty() : tensor + %8:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%6, %7 : tensor, tensor) { + ^bb0(%in: f16, %out: f16, %out_1: i1): + %9 = arith.maximumf %in, %cst : f16 + %10 = arith.cmpf ogt, %9, %cst : f16 + linalg.yield %9, %10 : f16, i1 + } -> (tensor, tensor) + %inserted_slice = tensor.insert_slice %8#0 into %arg11[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x256x14x14xf16> + %inserted_slice_0 = tensor.insert_slice %8#1 into %arg12[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x256x14x14xi1> + scf.yield %inserted_slice, %inserted_slice_0 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> + } + scf.yield %5#0, %5#1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> + } + scf.yield %4#0, %4#1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> + } + scf.yield %3#0, %3#1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> + } return %2#0, %2#1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> } - func.func private @BatchNormTrainingOp49(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<256xf32>) -> tensor<4x256x14x14xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) -> (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %1 = mhlo.convert %output : (tensor<4x256x14x14xf32>) -> tensor<4x256x14x14xf16> - return %1 : tensor<4x256x14x14xf16> - } - func.func private @Unknown50(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown48(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { + %c14 = arith.constant 14 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x256x14x14xf16> %1 = tensor.empty() : tensor<4x256x14x14xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) outs(%0, %1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) { - ^bb0(%in: f16, %in_0: f16, %out: f16, %out_1: i1): - %3 = arith.addf %in, %in_0 : f16 - %4 = arith.maxnumf %3, %cst : f16 - %5 = arith.cmpf ogt, %4, %cst : f16 - linalg.yield %4, %5 : f16, i1 - } -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) + %2:2 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %0, %arg4 = %1) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) { + %3:2 = scf.for %arg5 = 
%c0 to %c256 step %c1 iter_args(%arg6 = %arg3, %arg7 = %arg4) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) { + %4:2 = scf.for %arg8 = %c0 to %c14 step %c1 iter_args(%arg9 = %arg6, %arg10 = %arg7) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) { + %5:2 = scf.for %arg11 = %c0 to %c14 step %c1 iter_args(%arg12 = %arg9, %arg13 = %arg10) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) { + %extracted_slice = tensor.extract_slice %arg0[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x256x14x14xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x256x14x14xf16> to tensor + %6 = tensor.empty() : tensor + %7 = tensor.empty() : tensor + %8:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%6, %7 : tensor, tensor) { + ^bb0(%in: f16, %in_2: f16, %out: f16, %out_3: i1): + %9 = arith.addf %in, %in_2 : f16 + %10 = arith.maximumf %9, %cst : f16 + %11 = arith.cmpf ogt, %10, %cst : f16 + linalg.yield %10, %11 : f16, i1 + } -> (tensor, tensor) + %inserted_slice = tensor.insert_slice %8#0 into %arg12[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x256x14x14xf16> + %inserted_slice_1 = tensor.insert_slice %8#1 into %arg13[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x256x14x14xi1> + scf.yield %inserted_slice, %inserted_slice_1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> + } + scf.yield %5#0, %5#1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> + } + scf.yield %4#0, %4#1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> + } + scf.yield %3#0, %3#1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> + } return %2#0, %2#1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> } - func.func private @BatchNormTrainingOp51(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<512xf32>) -> tensor<4x512x7x7xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { + func.func private @BatchNormTrainingOp53(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<512xf32>) -> tensor<4x512x7x7xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { %0 = mhlo.convert %arg0 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) -> (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) %1 = mhlo.convert %output : (tensor<4x512x7x7xf32>) -> tensor<4x512x7x7xf16> return %1 : tensor<4x512x7x7xf16> } - func.func private @BatchNormTrainingOp52(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<512xf32>) -> tensor<4x512x7x7xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) -> (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %1 = mhlo.convert %output : (tensor<4x512x7x7xf32>) -> tensor<4x512x7x7xf16> - return %1 : 
tensor<4x512x7x7xf16> - } - func.func private @Unknown53(%arg0: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown55(%arg0: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x512x7x7xf16> %1 = tensor.empty() : tensor<4x512x7x7xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<4x512x7x7xf16>) outs(%0, %1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) { - ^bb0(%in: f16, %out: f16, %out_0: i1): - %3 = arith.maxnumf %in, %cst : f16 - %4 = arith.cmpf ogt, %3, %cst : f16 - linalg.yield %3, %4 : f16, i1 - } -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) - return %2#0, %2#1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> - } - func.func private @BatchNormTrainingOp54(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<512xf32>) -> tensor<4x512x7x7xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) -> (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %1 = mhlo.convert %output : (tensor<4x512x7x7xf32>) -> tensor<4x512x7x7xf16> - return %1 : tensor<4x512x7x7xf16> - } - func.func private @Unknown55(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x512x7x7xf16> - %1 = tensor.empty() : tensor<4x512x7x7xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) outs(%0, %1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) { - ^bb0(%in: f16, %in_0: f16, %out: f16, %out_1: i1): - %3 = arith.addf %in, %in_0 : f16 - %4 = arith.maxnumf %3, %cst : f16 - %5 = arith.cmpf ogt, %4, %cst : f16 - linalg.yield %4, %5 : f16, i1 - } -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) + %2:2 = scf.for %arg1 = %c0 to %c4 step %c1 iter_args(%arg2 = %0, %arg3 = %1) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) { + %3:2 = scf.for %arg4 = %c0 to %c512 step %c1 iter_args(%arg5 = %arg2, %arg6 = %arg3) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) { + %4:2 = scf.for %arg7 = %c0 to %c7 step %c1 iter_args(%arg8 = %arg5, %arg9 = %arg6) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) { + %5:2 = scf.for %arg10 = %c0 to %c7 step %c1 iter_args(%arg11 = %arg8, %arg12 = %arg9) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x512x7x7xf16> to tensor + %6 = tensor.empty() : tensor + %7 = tensor.empty() : tensor + %8:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%6, %7 : 
tensor, tensor) { + ^bb0(%in: f16, %out: f16, %out_1: i1): + %9 = arith.maximumf %in, %cst : f16 + %10 = arith.cmpf ogt, %9, %cst : f16 + linalg.yield %9, %10 : f16, i1 + } -> (tensor, tensor) + %inserted_slice = tensor.insert_slice %8#0 into %arg11[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x512x7x7xf16> + %inserted_slice_0 = tensor.insert_slice %8#1 into %arg12[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x512x7x7xi1> + scf.yield %inserted_slice, %inserted_slice_0 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> + } + scf.yield %5#0, %5#1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> + } + scf.yield %4#0, %4#1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> + } + scf.yield %3#0, %3#1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> + } return %2#0, %2#1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> } - func.func private @BatchNormTrainingOp56(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<512xf32>) -> tensor<4x512x7x7xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) -> (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %1 = mhlo.convert %output : (tensor<4x512x7x7xf32>) -> tensor<4x512x7x7xf16> - return %1 : tensor<4x512x7x7xf16> - } - func.func private @Unknown57(%arg0: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown57(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x512x7x7xf16> %1 = tensor.empty() : tensor<4x512x7x7xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<4x512x7x7xf16>) outs(%0, %1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) { - ^bb0(%in: f16, %out: f16, %out_0: i1): - %3 = arith.maxnumf %in, %cst : f16 - %4 = arith.cmpf ogt, %3, %cst : f16 - linalg.yield %3, %4 : f16, i1 - } -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) + %2:2 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %0, %arg4 = %1) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) { + %3:2 = scf.for %arg5 = %c0 to %c512 step %c1 iter_args(%arg6 = %arg3, %arg7 = %arg4) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) { + %4:2 = scf.for %arg8 = %c0 to %c7 step %c1 iter_args(%arg9 = %arg6, %arg10 = %arg7) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) { + %5:2 = scf.for %arg11 = %c0 to %c7 step %c1 iter_args(%arg12 = %arg9, %arg13 = %arg10) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) { + %extracted_slice = tensor.extract_slice %arg0[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x512x7x7xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x512x7x7xf16> to tensor + %6 = tensor.empty() : tensor + %7 = tensor.empty() : tensor + %8:2 
= linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%6, %7 : tensor, tensor) { + ^bb0(%in: f16, %in_2: f16, %out: f16, %out_3: i1): + %9 = arith.addf %in, %in_2 : f16 + %10 = arith.maximumf %9, %cst : f16 + %11 = arith.cmpf ogt, %10, %cst : f16 + linalg.yield %10, %11 : f16, i1 + } -> (tensor, tensor) + %inserted_slice = tensor.insert_slice %8#0 into %arg12[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x512x7x7xf16> + %inserted_slice_1 = tensor.insert_slice %8#1 into %arg13[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x512x7x7xi1> + scf.yield %inserted_slice, %inserted_slice_1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> + } + scf.yield %5#0, %5#1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> + } + scf.yield %4#0, %4#1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> + } + scf.yield %3#0, %3#1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> + } return %2#0, %2#1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> } - func.func private @BatchNormTrainingOp58(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<512xf32>) -> tensor<4x512x7x7xf16> attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormTrainingOp"} { - %0 = mhlo.convert %arg0 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %output, %batch_mean, %batch_var = "mhlo.batch_norm_training"(%0, %arg1, %arg2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) -> (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %1 = mhlo.convert %output : (tensor<4x512x7x7xf32>) -> tensor<4x512x7x7xf16> - return %1 : tensor<4x512x7x7xf16> - } - func.func private @Unknown59(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown62(%arg0: tensor<4x512x7x7xf16>) -> tensor<4x512xf16> attributes {__byteir_reduction_fusion__} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x512x7x7xf16> - %1 = tensor.empty() : tensor<4x512x7x7xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) outs(%0, %1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) { - ^bb0(%in: f16, %in_0: f16, %out: f16, %out_1: i1): - %3 = arith.addf %in, %in_0 : f16 - %4 = arith.maxnumf %3, %cst : f16 - %5 = arith.cmpf ogt, %4, %cst : f16 - linalg.yield %4, %5 : f16, i1 - } -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) - return %2#0, %2#1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> - } - func.func private @Unknown60(%arg0: tensor<4x512xf16>) -> tensor<4x512xf16> attributes {__byteir_elementwise_fusion__} { + %collapsed = tensor.collapse_shape %arg0 [[0, 1], [2, 3]] : tensor<4x512x7x7xf16> into tensor<2048x49xf16> + %0 = tensor.empty() : tensor<2048xf16> + %1 = scf.forall (%arg1) in (2048) shared_outs(%arg2 = %0) -> (tensor<2048xf16>) { + %extracted_slice = tensor.extract_slice %collapsed[%arg1, 0] [1, 49] [1, 1] : tensor<2048x49xf16> to tensor<49xf16> + %expanded_0 = tensor.expand_shape %extracted_slice [[0, 1]] : tensor<49xf16> into tensor<1x49xf16> + %extracted_slice_1 = tensor.extract_slice %arg2[%arg1] [1] [1] : tensor<2048xf16> to 
tensor + %2 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<64xf16> + %3 = scf.forall (%arg3) in (64) shared_outs(%arg4 = %2) -> (tensor<64xf16>) { + %15 = affine.min #map6(%arg3) + %16 = affine.min #map7(%arg3) + %17 = affine.apply #map3(%16, %15) + %extracted_slice_7 = tensor.extract_slice %expanded_0[0, %15] [1, %17] [1, 1] : tensor<1x49xf16> to tensor + %expanded_8 = tensor.expand_shape %extracted_slice_7 [[0, 1]] : tensor into tensor<1x?xf16> + %dim = tensor.dim %expanded_8, %c1 : tensor<1x?xf16> + %18 = arith.cmpi ugt, %dim, %c0 : index + %19 = scf.if %18 -> (f16) { + %extracted = tensor.extract %expanded_8[%c0, %c0] : tensor<1x?xf16> + scf.yield %extracted : f16 + } else { + scf.yield %cst : f16 + } + %20 = arith.addf %19, %cst : f16 + %extracted_slice_9 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<64xf16> to tensor + %inserted = tensor.insert %20 into %extracted_slice_9[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<64xf16> + } + } {mapping = [#gpu.thread]} + %expanded_2 = tensor.expand_shape %3 [[0, 1]] : tensor<64xf16> into tensor<32x2xf16> + %4 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<32xf16> + %5 = scf.forall (%arg3) in (32) shared_outs(%arg4 = %4) -> (tensor<32xf16>) { + %extracted = tensor.extract %expanded_2[%arg3, %c0] : tensor<32x2xf16> + %15 = arith.addf %extracted, %cst : f16 + %extracted_7 = tensor.extract %expanded_2[%arg3, %c1] : tensor<32x2xf16> + %16 = arith.addf %extracted_7, %15 : f16 + %extracted_slice_8 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<32xf16> to tensor + %inserted = tensor.insert %16 into %extracted_slice_8[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<32xf16> + } + } {mapping = [#gpu.thread]} + %expanded_3 = tensor.expand_shape %5 [[0, 1]] : tensor<32xf16> into tensor<16x2xf16> + %6 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<16xf16> + %7 = scf.forall (%arg3) in (16) shared_outs(%arg4 = %6) -> (tensor<16xf16>) { + %extracted = tensor.extract %expanded_3[%arg3, %c0] : tensor<16x2xf16> + %15 = arith.addf %extracted, %cst : f16 + %extracted_7 = tensor.extract %expanded_3[%arg3, %c1] : tensor<16x2xf16> + %16 = arith.addf %extracted_7, %15 : f16 + %extracted_slice_8 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<16xf16> to tensor + %inserted = tensor.insert %16 into %extracted_slice_8[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<16xf16> + } + } {mapping = [#gpu.thread]} + %expanded_4 = tensor.expand_shape %7 [[0, 1]] : tensor<16xf16> into tensor<8x2xf16> + %8 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<8xf16> + %9 = scf.forall (%arg3) in (8) shared_outs(%arg4 = %8) -> (tensor<8xf16>) { + %extracted = tensor.extract %expanded_4[%arg3, %c0] : tensor<8x2xf16> + %15 = arith.addf %extracted, %cst : f16 + %extracted_7 = tensor.extract %expanded_4[%arg3, %c1] : tensor<8x2xf16> + %16 = arith.addf %extracted_7, %15 : f16 + %extracted_slice_8 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<8xf16> to tensor + %inserted = tensor.insert %16 into %extracted_slice_8[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<8xf16> + } + } {mapping = [#gpu.thread]} + %expanded_5 = tensor.expand_shape %9 [[0, 1]] : 
tensor<8xf16> into tensor<4x2xf16> + %10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<4xf16> + %11 = scf.forall (%arg3) in (4) shared_outs(%arg4 = %10) -> (tensor<4xf16>) { + %extracted = tensor.extract %expanded_5[%arg3, %c0] : tensor<4x2xf16> + %15 = arith.addf %extracted, %cst : f16 + %extracted_7 = tensor.extract %expanded_5[%arg3, %c1] : tensor<4x2xf16> + %16 = arith.addf %extracted_7, %15 : f16 + %extracted_slice_8 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<4xf16> to tensor + %inserted = tensor.insert %16 into %extracted_slice_8[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<4xf16> + } + } {mapping = [#gpu.thread]} + %expanded_6 = tensor.expand_shape %11 [[0, 1]] : tensor<4xf16> into tensor<2x2xf16> + %12 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<2xf16> + %13 = scf.forall (%arg3) in (2) shared_outs(%arg4 = %12) -> (tensor<2xf16>) { + %extracted = tensor.extract %expanded_6[%arg3, %c0] : tensor<2x2xf16> + %15 = arith.addf %extracted, %cst : f16 + %extracted_7 = tensor.extract %expanded_6[%arg3, %c1] : tensor<2x2xf16> + %16 = arith.addf %extracted_7, %15 : f16 + %extracted_slice_8 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<2xf16> to tensor + %inserted = tensor.insert %16 into %extracted_slice_8[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<2xf16> + } + } {mapping = [#gpu.thread]} + %14 = scf.forall (%arg3) in (1) shared_outs(%arg4 = %extracted_slice_1) -> (tensor) { + %15 = affine.apply #map4(%arg3) + %extracted = tensor.extract %13[%15] : tensor<2xf16> + %16 = arith.addf %extracted, %cst : f16 + %17 = affine.apply #map5(%arg3) + %extracted_7 = tensor.extract %13[%17] : tensor<2xf16> + %18 = arith.addf %extracted_7, %16 : f16 + %extracted_slice_8 = tensor.extract_slice %arg4[] [] [] : tensor to tensor + %inserted = tensor.insert %18 into %extracted_slice_8[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[] [] [] : tensor into tensor + } + } {mapping = [#gpu.thread]} + scf.forall.in_parallel { + tensor.parallel_insert_slice %14 into %arg2[%arg1] [1] [1] : tensor into tensor<2048xf16> + } + } {mapping = [#gpu.block]} + %expanded = tensor.expand_shape %1 [[0, 1]] : tensor<2048xf16> into tensor<4x512xf16> + return %expanded : tensor<4x512xf16> + } + func.func private @Unknown63(%arg0: tensor<4x512xf16>) -> tensor<4x512xf16> attributes {__byteir_elementwise_fusion__} { + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 2.040100e-02 : f16 %0 = tensor.empty() : tensor<4x512xf16> - %1 = linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<4x512xf16>) outs(%0 : tensor<4x512xf16>) { - ^bb0(%in: f16, %out: f16): - %2 = arith.mulf %in, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<4x512xf16> + %1 = scf.for %arg1 = %c0 to %c4 step %c1 iter_args(%arg2 = %0) -> (tensor<4x512xf16>) { + %2 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %arg2) -> (tensor<4x512xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3] [1, 1] [1, 1] : tensor<4x512xf16> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f16, %out: 
f16): + %5 = arith.mulf %in, %cst : f16 + linalg.yield %5 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3] [1, 1] [1, 1] : tensor into tensor<4x512xf16> + scf.yield %inserted_slice : tensor<4x512xf16> + } + scf.yield %2 : tensor<4x512xf16> + } return %1 : tensor<4x512xf16> } - func.func private @Unknown61(%arg0: tensor<1000xf32>, %arg1: tensor<4x1000xf16>) -> tensor<4x1000xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown64(%arg0: tensor<1000xf16>, %arg1: tensor<4x1000xf16>) -> tensor<4x1000xf16> attributes {__byteir_elementwise_fusion__} { + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<4x1000xf16> - %1 = linalg.generic {indexing_maps = [#map1, #map2, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg1, %arg0 : tensor<4x1000xf16>, tensor<1000xf32>) outs(%0 : tensor<4x1000xf16>) { - ^bb0(%in: f16, %in_0: f32, %out: f16): - %2 = arith.truncf %in_0 : f32 to f16 - %3 = arith.addf %in, %2 : f16 - linalg.yield %3 : f16 - } -> tensor<4x1000xf16> + %1 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %0) -> (tensor<4x1000xf16>) { + %2 = scf.for %arg4 = %c0 to %c1000 step %c1 iter_args(%arg5 = %arg3) -> (tensor<4x1000xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg4] [1] [1] : tensor<1000xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg4] [1, 1] [1, 1] : tensor<4x1000xf16> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%3 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + %5 = arith.addf %in_1, %in : f16 + linalg.yield %5 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg5[%arg2, %arg4] [1, 1] [1, 1] : tensor into tensor<4x1000xf16> + scf.yield %inserted_slice : tensor<4x1000xf16> + } + scf.yield %2 : tensor<4x1000xf16> + } return %1 : tensor<4x1000xf16> } - func.func private @Unknown62(%arg0: tensor<4xf16>, %arg1: tensor<4x1000xf16>) -> (tensor<4x1000xf16>, tensor<4x1000xf16>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown65(%arg0: tensor<4x1000xf16>) -> tensor<4xf16> attributes {__byteir_reduction_fusion__} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 0.000000e+00 : f16 + %0 = tensor.empty() : tensor<4xf16> + %1 = scf.forall (%arg1) in (4) shared_outs(%arg2 = %0) -> (tensor<4xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, 0] [1, 1000] [1, 1] : tensor<4x1000xf16> to tensor<1000xf16> + %expanded = tensor.expand_shape %extracted_slice [[0, 1]] : tensor<1000xf16> into tensor<1x1000xf16> + %extracted_slice_0 = tensor.extract_slice %arg2[%arg1] [1] [1] : tensor<4xf16> to tensor + %2 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<512xf16> + %3 = scf.forall (%arg3) in (512) shared_outs(%arg4 = %2) -> (tensor<512xf16>) { + %21 = affine.min #map1(%arg3) + %22 = affine.min #map2(%arg3) + %23 = affine.apply #map3(%22, %21) + %extracted_slice_9 = tensor.extract_slice %expanded[0, %21] [1, %23] [1, 1] : tensor<1x1000xf16> to tensor + %expanded_10 = tensor.expand_shape %extracted_slice_9 [[0, 1]] : tensor into tensor<1x?xf16> + %dim = tensor.dim %expanded_10, %c1 : tensor<1x?xf16> + %24 = arith.cmpi ugt, %dim, %c0 : index + %25 = scf.if %24 -> (f16) { + %extracted = tensor.extract 
%expanded_10[%c0, %c0] : tensor<1x?xf16> + scf.yield %extracted : f16 + } else { + scf.yield %cst : f16 + } + %dim_11 = tensor.dim %expanded_10, %c1 : tensor<1x?xf16> + %26 = arith.cmpi ugt, %dim_11, %c1 : index + %27 = scf.if %26 -> (f16) { + %extracted = tensor.extract %expanded_10[%c0, %c1] : tensor<1x?xf16> + scf.yield %extracted : f16 + } else { + scf.yield %cst : f16 + } + %28 = arith.maximumf %25, %27 : f16 + %extracted_slice_12 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<512xf16> to tensor + %inserted = tensor.insert %28 into %extracted_slice_12[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<512xf16> + } + } {mapping = [#gpu.thread]} + %expanded_1 = tensor.expand_shape %3 [[0, 1]] : tensor<512xf16> into tensor<256x2xf16> + %4 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<256xf16> + %5 = scf.forall (%arg3) in (256) shared_outs(%arg4 = %4) -> (tensor<256xf16>) { + %extracted = tensor.extract %expanded_1[%arg3, %c0] : tensor<256x2xf16> + %extracted_9 = tensor.extract %expanded_1[%arg3, %c1] : tensor<256x2xf16> + %21 = arith.maximumf %extracted_9, %extracted : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<256xf16> to tensor + %inserted = tensor.insert %21 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<256xf16> + } + } {mapping = [#gpu.thread]} + %expanded_2 = tensor.expand_shape %5 [[0, 1]] : tensor<256xf16> into tensor<128x2xf16> + %6 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<128xf16> + %7 = scf.forall (%arg3) in (128) shared_outs(%arg4 = %6) -> (tensor<128xf16>) { + %extracted = tensor.extract %expanded_2[%arg3, %c0] : tensor<128x2xf16> + %extracted_9 = tensor.extract %expanded_2[%arg3, %c1] : tensor<128x2xf16> + %21 = arith.maximumf %extracted_9, %extracted : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<128xf16> to tensor + %inserted = tensor.insert %21 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<128xf16> + } + } {mapping = [#gpu.thread]} + %expanded_3 = tensor.expand_shape %7 [[0, 1]] : tensor<128xf16> into tensor<64x2xf16> + %8 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<64xf16> + %9 = scf.forall (%arg3) in (64) shared_outs(%arg4 = %8) -> (tensor<64xf16>) { + %extracted = tensor.extract %expanded_3[%arg3, %c0] : tensor<64x2xf16> + %extracted_9 = tensor.extract %expanded_3[%arg3, %c1] : tensor<64x2xf16> + %21 = arith.maximumf %extracted_9, %extracted : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<64xf16> to tensor + %inserted = tensor.insert %21 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<64xf16> + } + } {mapping = [#gpu.thread]} + %expanded_4 = tensor.expand_shape %9 [[0, 1]] : tensor<64xf16> into tensor<32x2xf16> + %10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<32xf16> + %11 = scf.forall (%arg3) in (32) shared_outs(%arg4 = %10) -> (tensor<32xf16>) { + %extracted = tensor.extract %expanded_4[%arg3, %c0] : tensor<32x2xf16> + %extracted_9 = tensor.extract %expanded_4[%arg3, %c1] : tensor<32x2xf16> + %21 = arith.maximumf %extracted_9, %extracted : f16 + 
%extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<32xf16> to tensor + %inserted = tensor.insert %21 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<32xf16> + } + } {mapping = [#gpu.thread]} + %expanded_5 = tensor.expand_shape %11 [[0, 1]] : tensor<32xf16> into tensor<16x2xf16> + %12 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<16xf16> + %13 = scf.forall (%arg3) in (16) shared_outs(%arg4 = %12) -> (tensor<16xf16>) { + %extracted = tensor.extract %expanded_5[%arg3, %c0] : tensor<16x2xf16> + %extracted_9 = tensor.extract %expanded_5[%arg3, %c1] : tensor<16x2xf16> + %21 = arith.maximumf %extracted_9, %extracted : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<16xf16> to tensor + %inserted = tensor.insert %21 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<16xf16> + } + } {mapping = [#gpu.thread]} + %expanded_6 = tensor.expand_shape %13 [[0, 1]] : tensor<16xf16> into tensor<8x2xf16> + %14 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<8xf16> + %15 = scf.forall (%arg3) in (8) shared_outs(%arg4 = %14) -> (tensor<8xf16>) { + %extracted = tensor.extract %expanded_6[%arg3, %c0] : tensor<8x2xf16> + %extracted_9 = tensor.extract %expanded_6[%arg3, %c1] : tensor<8x2xf16> + %21 = arith.maximumf %extracted_9, %extracted : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<8xf16> to tensor + %inserted = tensor.insert %21 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<8xf16> + } + } {mapping = [#gpu.thread]} + %expanded_7 = tensor.expand_shape %15 [[0, 1]] : tensor<8xf16> into tensor<4x2xf16> + %16 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<4xf16> + %17 = scf.forall (%arg3) in (4) shared_outs(%arg4 = %16) -> (tensor<4xf16>) { + %extracted = tensor.extract %expanded_7[%arg3, %c0] : tensor<4x2xf16> + %extracted_9 = tensor.extract %expanded_7[%arg3, %c1] : tensor<4x2xf16> + %21 = arith.maximumf %extracted_9, %extracted : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<4xf16> to tensor + %inserted = tensor.insert %21 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<4xf16> + } + } {mapping = [#gpu.thread]} + %expanded_8 = tensor.expand_shape %17 [[0, 1]] : tensor<4xf16> into tensor<2x2xf16> + %18 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<2xf16> + %19 = scf.forall (%arg3) in (2) shared_outs(%arg4 = %18) -> (tensor<2xf16>) { + %extracted = tensor.extract %expanded_8[%arg3, %c0] : tensor<2x2xf16> + %extracted_9 = tensor.extract %expanded_8[%arg3, %c1] : tensor<2x2xf16> + %21 = arith.maximumf %extracted_9, %extracted : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<2xf16> to tensor + %inserted = tensor.insert %21 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<2xf16> + } + } {mapping = [#gpu.thread]} + %20 = scf.forall (%arg3) in (1) shared_outs(%arg4 = %extracted_slice_0) -> (tensor) { + %21 = affine.apply #map4(%arg3) + %extracted = 
tensor.extract %19[%21] : tensor<2xf16> + %22 = affine.apply #map5(%arg3) + %extracted_9 = tensor.extract %19[%22] : tensor<2xf16> + %23 = arith.maximumf %extracted_9, %extracted : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[] [] [] : tensor to tensor + %inserted = tensor.insert %23 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[] [] [] : tensor into tensor + } + } {mapping = [#gpu.thread]} + scf.forall.in_parallel { + tensor.parallel_insert_slice %20 into %arg2[%arg1] [1] [1] : tensor into tensor<4xf16> + } + } {mapping = [#gpu.block]} + return %1 : tensor<4xf16> + } + func.func private @Unknown66(%arg0: tensor<4xf16>, %arg1: tensor<4x1000xf16>) -> tensor<4x1000xf16> attributes {__byteir_elementwise_fusion__} { + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<4x1000xf16> - %1:2 = linalg.generic {indexing_maps = [#map1, #map3, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg1, %arg0 : tensor<4x1000xf16>, tensor<4xf16>) outs(%0, %0 : tensor<4x1000xf16>, tensor<4x1000xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16, %out_1: f16): - %2 = arith.subf %in, %in_0 : f16 - %3 = math.exp %2 : f16 - linalg.yield %2, %3 : f16, f16 - } -> (tensor<4x1000xf16>, tensor<4x1000xf16>) - return %1#0, %1#1 : tensor<4x1000xf16>, tensor<4x1000xf16> + %1 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %0) -> (tensor<4x1000xf16>) { + %2 = scf.for %arg4 = %c0 to %c1000 step %c1 iter_args(%arg5 = %arg3) -> (tensor<4x1000xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg2] [1] [1] : tensor<4xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg4] [1, 1] [1, 1] : tensor<4x1000xf16> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%3 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + %5 = arith.subf %in_1, %in : f16 + linalg.yield %5 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg5[%arg2, %arg4] [1, 1] [1, 1] : tensor into tensor<4x1000xf16> + scf.yield %inserted_slice : tensor<4x1000xf16> + } + scf.yield %2 : tensor<4x1000xf16> + } + return %1 : tensor<4x1000xf16> } - func.func private @Unknown63(%arg0: tensor<4xf16>, %arg1: tensor<4x1000xf16>, %arg2: tensor<4xf16>, %arg3: tensor<4x1000xf16>, %arg4: tensor<4x1000xf32>) -> (tensor<4x1000xf16>, tensor<4x1000xf32>, tensor<4x1000xf32>) attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<4x1000xf16> - %1 = tensor.empty() : tensor<4x1000xf32> - %2:3 = linalg.generic {indexing_maps = [#map1, #map1, #map3, #map3, #map1, #map1, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg3, %arg1, %arg0, %arg2, %arg4 : tensor<4x1000xf16>, tensor<4x1000xf16>, tensor<4xf16>, tensor<4xf16>, tensor<4x1000xf32>) outs(%0, %1, %1 : tensor<4x1000xf16>, tensor<4x1000xf32>, tensor<4x1000xf32>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %in_2: f16, %in_3: f32, %out: f16, %out_4: f32, %out_5: f32): - %3 = math.log %in_1 : f16 - %4 = arith.subf %in_0, %3 : f16 - %5 = math.exp %4 : f16 - %6 = arith.mulf %5, %in_2 : f16 - %7 = arith.subf %in, %6 : f16 - %8 = arith.extf %4 : f16 to f32 - %9 = arith.mulf %8, %in_3 : f32 - %10 = arith.extf %7 : f16 to f32 - linalg.yield %7, %9, %10 : f16, f32, f32 - } -> (tensor<4x1000xf16>, tensor<4x1000xf32>, 
tensor<4x1000xf32>) - return %2#0, %2#1, %2#2 : tensor<4x1000xf16>, tensor<4x1000xf32>, tensor<4x1000xf32> - } - func.func private @Unknown64(%arg0: tensor<4x512xf16>, %arg1: tensor<4x512x7x7xi1>) -> tensor<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown67(%arg0: tensor<4x1000xf16>) -> tensor<4xf16> attributes {__byteir_reduction_fusion__} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index %cst = arith.constant 0.000000e+00 : f16 - %cst_0 = arith.constant 4.900000e+01 : f16 - %0 = tensor.empty() : tensor<4x512x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map4, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg1, %arg0 : tensor<4x512x7x7xi1>, tensor<4x512xf16>) outs(%0 : tensor<4x512x7x7xf16>) { - ^bb0(%in: i1, %in_1: f16, %out: f16): - %2 = arith.divf %in_1, %cst_0 : f16 - %3 = arith.select %in, %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<4x512x7x7xf16> - return %1 : tensor<4x512x7x7xf16> - } - func.func private @BatchNormGradOp65(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<512xf32> - %1 = mhlo.convert %arg0 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %2 = mhlo.convert %arg2 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<4x512x7x7xf32>) -> (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x512x7x7xf32>) -> tensor<4x512x7x7xf16> - return %3, %grad_scale, %grad_offset : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - } - func.func private @ConvBackwardDataOp66(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,512,512]{1,0,2,3}"} : (tensor<512x512x3x3xf16>) -> tensor<3x3x512x512xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,512,512]{1,0,2,3}"} : (tensor<3x3x512x512xf16>) -> tensor<3x3x512x512xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<3x3x512x512xf16>) -> tensor<4x512x7x7xf16> - return %2 : tensor<4x512x7x7xf16> + %0 = tensor.empty() : tensor<4xf16> + %1 = scf.forall (%arg1) in (4) shared_outs(%arg2 = %0) -> (tensor<4xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, 0] [1, 1000] [1, 1] : tensor<4x1000xf16> to tensor<1000xf16> + %expanded = tensor.expand_shape %extracted_slice [[0, 1]] : tensor<1000xf16> into 
tensor<1x1000xf16> + %extracted_slice_0 = tensor.extract_slice %arg2[%arg1] [1] [1] : tensor<4xf16> to tensor + %2 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<512xf16> + %3 = scf.forall (%arg3) in (512) shared_outs(%arg4 = %2) -> (tensor<512xf16>) { + %21 = affine.min #map1(%arg3) + %22 = affine.min #map2(%arg3) + %23 = affine.apply #map3(%22, %21) + %extracted_slice_9 = tensor.extract_slice %expanded[0, %21] [1, %23] [1, 1] : tensor<1x1000xf16> to tensor + %expanded_10 = tensor.expand_shape %extracted_slice_9 [[0, 1]] : tensor into tensor<1x?xf16> + %dim = tensor.dim %expanded_10, %c1 : tensor<1x?xf16> + %24 = arith.cmpi ugt, %dim, %c0 : index + %25 = scf.if %24 -> (f16) { + %extracted = tensor.extract %expanded_10[%c0, %c0] : tensor<1x?xf16> + scf.yield %extracted : f16 + } else { + scf.yield %cst : f16 + } + %26 = math.exp %25 : f16 + %27 = arith.addf %26, %cst : f16 + %dim_11 = tensor.dim %expanded_10, %c1 : tensor<1x?xf16> + %28 = arith.cmpi ugt, %dim_11, %c1 : index + %29 = scf.if %28 -> (f16) { + %extracted = tensor.extract %expanded_10[%c0, %c1] : tensor<1x?xf16> + scf.yield %extracted : f16 + } else { + scf.yield %cst : f16 + } + %30 = math.exp %29 : f16 + %31 = arith.addf %27, %30 : f16 + %extracted_slice_12 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<512xf16> to tensor + %inserted = tensor.insert %31 into %extracted_slice_12[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<512xf16> + } + } {mapping = [#gpu.thread]} + %expanded_1 = tensor.expand_shape %3 [[0, 1]] : tensor<512xf16> into tensor<256x2xf16> + %4 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<256xf16> + %5 = scf.forall (%arg3) in (256) shared_outs(%arg4 = %4) -> (tensor<256xf16>) { + %extracted = tensor.extract %expanded_1[%arg3, %c0] : tensor<256x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_1[%arg3, %c1] : tensor<256x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<256xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<256xf16> + } + } {mapping = [#gpu.thread]} + %expanded_2 = tensor.expand_shape %5 [[0, 1]] : tensor<256xf16> into tensor<128x2xf16> + %6 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<128xf16> + %7 = scf.forall (%arg3) in (128) shared_outs(%arg4 = %6) -> (tensor<128xf16>) { + %extracted = tensor.extract %expanded_2[%arg3, %c0] : tensor<128x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_2[%arg3, %c1] : tensor<128x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<128xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<128xf16> + } + } {mapping = [#gpu.thread]} + %expanded_3 = tensor.expand_shape %7 [[0, 1]] : tensor<128xf16> into tensor<64x2xf16> + %8 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<64xf16> + %9 = scf.forall (%arg3) in (64) shared_outs(%arg4 = %8) -> (tensor<64xf16>) { + %extracted = tensor.extract %expanded_3[%arg3, %c0] : tensor<64x2xf16> + %21 = arith.addf 
%extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_3[%arg3, %c1] : tensor<64x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<64xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<64xf16> + } + } {mapping = [#gpu.thread]} + %expanded_4 = tensor.expand_shape %9 [[0, 1]] : tensor<64xf16> into tensor<32x2xf16> + %10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<32xf16> + %11 = scf.forall (%arg3) in (32) shared_outs(%arg4 = %10) -> (tensor<32xf16>) { + %extracted = tensor.extract %expanded_4[%arg3, %c0] : tensor<32x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_4[%arg3, %c1] : tensor<32x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<32xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<32xf16> + } + } {mapping = [#gpu.thread]} + %expanded_5 = tensor.expand_shape %11 [[0, 1]] : tensor<32xf16> into tensor<16x2xf16> + %12 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<16xf16> + %13 = scf.forall (%arg3) in (16) shared_outs(%arg4 = %12) -> (tensor<16xf16>) { + %extracted = tensor.extract %expanded_5[%arg3, %c0] : tensor<16x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_5[%arg3, %c1] : tensor<16x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<16xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<16xf16> + } + } {mapping = [#gpu.thread]} + %expanded_6 = tensor.expand_shape %13 [[0, 1]] : tensor<16xf16> into tensor<8x2xf16> + %14 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<8xf16> + %15 = scf.forall (%arg3) in (8) shared_outs(%arg4 = %14) -> (tensor<8xf16>) { + %extracted = tensor.extract %expanded_6[%arg3, %c0] : tensor<8x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_6[%arg3, %c1] : tensor<8x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<8xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<8xf16> + } + } {mapping = [#gpu.thread]} + %expanded_7 = tensor.expand_shape %15 [[0, 1]] : tensor<8xf16> into tensor<4x2xf16> + %16 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<4xf16> + %17 = scf.forall (%arg3) in (4) shared_outs(%arg4 = %16) -> (tensor<4xf16>) { + %extracted = tensor.extract %expanded_7[%arg3, %c0] : tensor<4x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_7[%arg3, %c1] : tensor<4x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<4xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel 
{ + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<4xf16> + } + } {mapping = [#gpu.thread]} + %expanded_8 = tensor.expand_shape %17 [[0, 1]] : tensor<4xf16> into tensor<2x2xf16> + %18 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<2xf16> + %19 = scf.forall (%arg3) in (2) shared_outs(%arg4 = %18) -> (tensor<2xf16>) { + %extracted = tensor.extract %expanded_8[%arg3, %c0] : tensor<2x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_8[%arg3, %c1] : tensor<2x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<2xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<2xf16> + } + } {mapping = [#gpu.thread]} + %20 = scf.forall (%arg3) in (1) shared_outs(%arg4 = %extracted_slice_0) -> (tensor) { + %21 = affine.apply #map4(%arg3) + %extracted = tensor.extract %19[%21] : tensor<2xf16> + %22 = arith.addf %extracted, %cst : f16 + %23 = affine.apply #map5(%arg3) + %extracted_9 = tensor.extract %19[%23] : tensor<2xf16> + %24 = arith.addf %extracted_9, %22 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[] [] [] : tensor to tensor + %inserted = tensor.insert %24 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[] [] [] : tensor into tensor + } + } {mapping = [#gpu.thread]} + scf.forall.in_parallel { + tensor.parallel_insert_slice %20 into %arg2[%arg1] [1] [1] : tensor into tensor<4xf16> + } + } {mapping = [#gpu.block]} + return %1 : tensor<4xf16> + } + func.func private @Unknown68(%arg0: tensor<4xf16>) -> tensor<4xf16> attributes {__byteir_elementwise_fusion__} { + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index + %0 = tensor.empty() : tensor<4xf16> + %1 = scf.for %arg1 = %c0 to %c4 step %c1 iter_args(%arg2 = %0) -> (tensor<4xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1] [1] [1] : tensor<4xf16> to tensor + %2 = tensor.empty() : tensor + %3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%2 : tensor) { + ^bb0(%in: f16, %out: f16): + %4 = math.log %in : f16 + linalg.yield %4 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %3 into %arg2[%arg1] [1] [1] : tensor into tensor<4xf16> + scf.yield %inserted_slice : tensor<4xf16> + } + return %1 : tensor<4xf16> } - func.func private @ConvBackwardFilterOp67(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<4x512x7x7xf16>) -> tensor<512x512x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> tensor<3x3x512x512xf16> - %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[512,512,3,3]{0,1,3,2}"} : 
(tensor<3x3x512x512xf16>) -> tensor<512x512x3x3xf16> - return %1 : tensor<512x512x3x3xf16> + func.func private @Unknown69(%arg0: tensor<4xf16>, %arg1: tensor<4x1000xf16>, %arg2: tensor<4xf16>, %arg3: tensor<4x1000xf16>) -> (tensor<4x1000xf16>, tensor<4x1000xf16>) attributes {__byteir_elementwise_fusion__} { + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index + %0 = tensor.empty() : tensor<4x1000xf16> + %1:2 = scf.for %arg4 = %c0 to %c4 step %c1 iter_args(%arg5 = %0, %arg6 = %0) -> (tensor<4x1000xf16>, tensor<4x1000xf16>) { + %2:2 = scf.for %arg7 = %c0 to %c1000 step %c1 iter_args(%arg8 = %arg5, %arg9 = %arg6) -> (tensor<4x1000xf16>, tensor<4x1000xf16>) { + %extracted_slice = tensor.extract_slice %arg2[%arg4] [1] [1] : tensor<4xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg0[%arg4] [1] [1] : tensor<4xf16> to tensor + %extracted_slice_1 = tensor.extract_slice %arg1[%arg4, %arg7] [1, 1] [1, 1] : tensor<4x1000xf16> to tensor + %extracted_slice_2 = tensor.extract_slice %arg3[%arg4, %arg7] [1, 1] [1, 1] : tensor<4x1000xf16> to tensor + %3 = tensor.empty() : tensor + %4:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0, %extracted_slice_1, %extracted_slice_2 : tensor, tensor, tensor, tensor) outs(%3, %3 : tensor, tensor) { + ^bb0(%in: f16, %in_4: f16, %in_5: f16, %in_6: f16, %out: f16, %out_7: f16): + %5 = arith.subf %in_5, %in_4 : f16 + %6 = math.exp %5 : f16 + %7 = arith.mulf %6, %in : f16 + %8 = arith.subf %in_6, %7 : f16 + linalg.yield %5, %8 : f16, f16 + } -> (tensor, tensor) + %inserted_slice = tensor.insert_slice %4#0 into %arg8[%arg4, %arg7] [1, 1] [1, 1] : tensor into tensor<4x1000xf16> + %inserted_slice_3 = tensor.insert_slice %4#1 into %arg9[%arg4, %arg7] [1, 1] [1, 1] : tensor into tensor<4x1000xf16> + scf.yield %inserted_slice, %inserted_slice_3 : tensor<4x1000xf16>, tensor<4x1000xf16> + } + scf.yield %2#0, %2#1 : tensor<4x1000xf16>, tensor<4x1000xf16> + } + return %1#0, %1#1 : tensor<4x1000xf16>, tensor<4x1000xf16> } - func.func private @Unknown68(%arg0: tensor<4x512x7x7xi1>, %arg1: tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown70(%arg0: tensor<4x512xf16>, %arg1: tensor<4x512x7x7xi1>) -> tensor<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 + %cst_0 = arith.constant 4.900000e+01 : f16 %0 = tensor.empty() : tensor<4x512x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x512x7x7xi1>, tensor<4x512x7x7xf16>) outs(%0 : tensor<4x512x7x7xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %2 = arith.select %in, %in_0, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<4x512x7x7xf16> + %1 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %0) -> (tensor<4x512x7x7xf16>) { + %2 = scf.for %arg4 = %c0 to %c512 step %c1 iter_args(%arg5 = %arg3) -> (tensor<4x512x7x7xf16>) { + %3 = scf.for %arg6 = %c0 to %c7 step %c1 iter_args(%arg7 = %arg5) -> (tensor<4x512x7x7xf16>) { + %4 = scf.for %arg8 = %c0 to %c7 step %c1 iter_args(%arg9 = %arg7) -> (tensor<4x512x7x7xf16>) { + %extracted_slice = tensor.extract_slice 
%arg0[%arg2, %arg4] [1, 1] [1, 1] : tensor<4x512xf16> to tensor + %extracted_slice_1 = tensor.extract_slice %arg1[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x512x7x7xi1> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_1 : tensor, tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %in_2: i1, %out: f16): + %7 = arith.divf %in, %cst_0 : f16 + %8 = arith.select %in_2, %7, %cst : f16 + linalg.yield %8 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg9[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x512x7x7xf16> + scf.yield %inserted_slice : tensor<4x512x7x7xf16> + } + scf.yield %4 : tensor<4x512x7x7xf16> + } + scf.yield %3 : tensor<4x512x7x7xf16> + } + scf.yield %2 : tensor<4x512x7x7xf16> + } return %1 : tensor<4x512x7x7xf16> } - func.func private @BatchNormGradOp69(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { + func.func private @BatchNormGradOp71(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { %0 = mhlo.constant dense<0.000000e+00> : tensor<512xf32> %1 = mhlo.convert %arg0 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> %2 = mhlo.convert %arg2 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> @@ -647,165 +1504,136 @@ module @IrToMhlo.2452 { %3 = mhlo.convert %grad_operand : (tensor<4x512x7x7xf32>) -> tensor<4x512x7x7xf16> return %3, %grad_scale, %grad_offset : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> } - func.func private @ConvBackwardDataOp70(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { + func.func private @ConvBackwardDataOp72(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,512,512]{1,0,2,3}"} : (tensor<512x512x3x3xf16>) -> tensor<3x3x512x512xf16> %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,512,512]{1,0,2,3}"} : (tensor<3x3x512x512xf16>) -> tensor<3x3x512x512xf16> %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<3x3x512x512xf16>) -> 
tensor<4x512x7x7xf16> return %2 : tensor<4x512x7x7xf16> } - func.func private @ConvBackwardFilterOp71(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<4x512x7x7xf16>) -> tensor<512x512x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { + func.func private @ConvBackwardFilterOp73(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<4x512x7x7xf16>) -> tensor<512x512x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> tensor<3x3x512x512xf16> %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[512,512,3,3]{0,1,3,2}"} : (tensor<3x3x512x512xf16>) -> tensor<512x512x3x3xf16> return %1 : tensor<512x512x3x3xf16> } - func.func private @Unknown72(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<4x512x7x7xf16>, %arg2: tensor<4x512x7x7xi1>) -> tensor<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown74(%arg0: tensor<4x512x7x7xi1>, %arg1: tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x512x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<4x512x7x7xi1>, tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) outs(%0 : tensor<4x512x7x7xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.select %in, %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<4x512x7x7xf16> + %1 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %0) -> (tensor<4x512x7x7xf16>) { + %2 = scf.for %arg4 = %c0 to %c512 step %c1 iter_args(%arg5 = %arg3) -> (tensor<4x512x7x7xf16>) { + %3 = scf.for %arg6 = %c0 to %c7 step %c1 iter_args(%arg7 = %arg5) -> (tensor<4x512x7x7xf16>) { + %4 = scf.for %arg8 = %c0 to %c7 step %c1 iter_args(%arg9 = %arg7) -> (tensor<4x512x7x7xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x512x7x7xi1> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x512x7x7xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%5 : tensor) { + ^bb0(%in: i1, %in_1: f16, %out: f16): + %7 = arith.select %in, %in_1, %cst : f16 + linalg.yield %7 : f16 + } -> tensor 
+ %inserted_slice = tensor.insert_slice %6 into %arg9[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x512x7x7xf16> + scf.yield %inserted_slice : tensor<4x512x7x7xf16> + } + scf.yield %4 : tensor<4x512x7x7xf16> + } + scf.yield %3 : tensor<4x512x7x7xf16> + } + scf.yield %2 : tensor<4x512x7x7xf16> + } return %1 : tensor<4x512x7x7xf16> } - func.func private @BatchNormGradOp73(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<512xf32> - %1 = mhlo.convert %arg0 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %2 = mhlo.convert %arg2 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<4x512x7x7xf32>) -> (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x512x7x7xf32>) -> tensor<4x512x7x7xf16> - return %3, %grad_scale, %grad_offset : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - } - func.func private @ConvBackwardDataOp74(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,512,512]{1,0,2,3}"} : (tensor<512x512x3x3xf16>) -> tensor<3x3x512x512xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,512,512]{1,0,2,3}"} : (tensor<3x3x512x512xf16>) -> tensor<3x3x512x512xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<3x3x512x512xf16>) -> tensor<4x512x7x7xf16> - return %2 : tensor<4x512x7x7xf16> - } - func.func private @ConvBackwardFilterOp75(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<4x512x7x7xf16>) -> tensor<512x512x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> tensor<3x3x512x512xf16> - %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[512,512,3,3]{0,1,3,2}"} : (tensor<3x3x512x512xf16>) -> 
tensor<512x512x3x3xf16> - return %1 : tensor<512x512x3x3xf16> - } - func.func private @Unknown76(%arg0: tensor<4x512x7x7xi1>, %arg1: tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown78(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<4x512x7x7xf16>, %arg2: tensor<4x512x7x7xi1>) -> tensor<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x512x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x512x7x7xi1>, tensor<4x512x7x7xf16>) outs(%0 : tensor<4x512x7x7xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %2 = arith.select %in, %in_0, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<4x512x7x7xf16> + %1 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %0) -> (tensor<4x512x7x7xf16>) { + %2 = scf.for %arg5 = %c0 to %c512 step %c1 iter_args(%arg6 = %arg4) -> (tensor<4x512x7x7xf16>) { + %3 = scf.for %arg7 = %c0 to %c7 step %c1 iter_args(%arg8 = %arg6) -> (tensor<4x512x7x7xf16>) { + %4 = scf.for %arg9 = %c0 to %c7 step %c1 iter_args(%arg10 = %arg8) -> (tensor<4x512x7x7xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x512x7x7xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x512x7x7xf16> to tensor + %extracted_slice_1 = tensor.extract_slice %arg2[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x512x7x7xi1> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0, %extracted_slice_1 : tensor, tensor, tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %in_2: f16, %in_3: i1, %out: f16): + %7 = arith.addf %in, %in_2 : f16 + %8 = arith.select %in_3, %7, %cst : f16 + linalg.yield %8 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg10[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x512x7x7xf16> + scf.yield %inserted_slice : tensor<4x512x7x7xf16> + } + scf.yield %4 : tensor<4x512x7x7xf16> + } + scf.yield %3 : tensor<4x512x7x7xf16> + } + scf.yield %2 : tensor<4x512x7x7xf16> + } return %1 : tensor<4x512x7x7xf16> } - func.func private @BatchNormGradOp77(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<512xf32> - %1 = mhlo.convert %arg0 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %2 = mhlo.convert %arg2 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<4x512x7x7xf32>) -> (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x512x7x7xf32>) -> tensor<4x512x7x7xf16> - return %3, %grad_scale, %grad_offset : 
tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - } - func.func private @ConvBackwardDataOp78(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512x256x3x3xf16>) -> tensor<4x256x14x14xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { + func.func private @ConvBackwardDataOp84(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512x256x3x3xf16>) -> tensor<4x256x14x14xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,256,512]{1,0,2,3}"} : (tensor<512x256x3x3xf16>) -> tensor<3x3x256x512xf16> %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,256,512]{1,0,2,3}"} : (tensor<3x3x256x512xf16>) -> tensor<3x3x256x512xf16> %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 2], [1, 2]], lhs_dilate = [2, 2], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<3x3x256x512xf16>) -> tensor<4x256x14x14xf16> return %2 : tensor<4x256x14x14xf16> } - func.func private @ConvBackwardFilterOp79(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x512x7x7xf16>) -> tensor<512x256x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { + func.func private @ConvBackwardFilterOp85(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x512x7x7xf16>) -> tensor<512x256x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 0], [1, 0]], lhs_dilate = [1, 1], rhs_dilate = [2, 2]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<4x512x7x7xf16>) -> tensor<3x3x256x512xf16> %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[512,256,3,3]{0,1,3,2}"} : (tensor<3x3x256x512xf16>) -> tensor<512x256x3x3xf16> return %1 : tensor<512x256x3x3xf16> } - func.func private @BatchNormGradOp80(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512xf32>, %arg2: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<512xf32> - 
%1 = mhlo.convert %arg0 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %2 = mhlo.convert %arg2 : (tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<4x512x7x7xf32>) -> (tensor<4x512x7x7xf32>, tensor<512xf32>, tensor<512xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x512x7x7xf32>) -> tensor<4x512x7x7xf16> - return %3, %grad_scale, %grad_offset : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - } - func.func private @ConvBackwardDataOp81(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512x256x1x1xf16>) -> tensor<4x256x14x14xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { + func.func private @ConvBackwardDataOp87(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<512x256x1x1xf16>) -> tensor<4x256x14x14xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[1,1,256,512]{1,0,2,3}"} : (tensor<512x256x1x1xf16>) -> tensor<1x1x256x512xf16> %1 = mhlo.convolution(%arg0, %0) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 1], [0, 1]], lhs_dilate = [2, 2], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<1x1x256x512xf16>) -> tensor<4x256x14x14xf16> return %1 : tensor<4x256x14x14xf16> } - func.func private @ConvBackwardFilterOp82(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x512x7x7xf16>) -> tensor<512x256x1x1xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { + func.func private @ConvBackwardFilterOp88(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x512x7x7xf16>) -> tensor<512x256x1x1xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[0, -1], [0, -1]], lhs_dilate = [1, 1], rhs_dilate = [2, 2]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<4x512x7x7xf16>) -> tensor<1x1x256x512xf16> %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[512,256,1,1]{0,1,3,2}"} : (tensor<1x1x256x512xf16>) -> 
tensor<512x256x1x1xf16> return %1 : tensor<512x256x1x1xf16> } - func.func private @Unknown83(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x256x14x14xf16>, %arg2: tensor<4x256x14x14xi1>) -> tensor<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown89(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x256x14x14xf16>, %arg2: tensor<4x256x14x14xi1>) -> tensor<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { + %c14 = arith.constant 14 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x256x14x14xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<4x256x14x14xi1>, tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) outs(%0 : tensor<4x256x14x14xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.select %in, %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<4x256x14x14xf16> - return %1 : tensor<4x256x14x14xf16> - } - func.func private @BatchNormGradOp84(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<256xf32> - %1 = mhlo.convert %arg0 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %2 = mhlo.convert %arg2 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<4x256x14x14xf32>) -> (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x256x14x14xf32>) -> tensor<4x256x14x14xf16> - return %3, %grad_scale, %grad_offset : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - } - func.func private @ConvBackwardDataOp85(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,256,256]{1,0,2,3}"} : (tensor<256x256x3x3xf16>) -> tensor<3x3x256x256xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,256,256]{1,0,2,3}"} : (tensor<3x3x256x256xf16>) -> tensor<3x3x256x256xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<3x3x256x256xf16>) -> tensor<4x256x14x14xf16> - return %2 : tensor<4x256x14x14xf16> - } - func.func private @ConvBackwardFilterOp86(%arg0: tensor<4x256x14x14xf16>, %arg1: 
tensor<4x256x14x14xf16>) -> tensor<256x256x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> tensor<3x3x256x256xf16> - %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[256,256,3,3]{0,1,3,2}"} : (tensor<3x3x256x256xf16>) -> tensor<256x256x3x3xf16> - return %1 : tensor<256x256x3x3xf16> - } - func.func private @Unknown87(%arg0: tensor<4x256x14x14xi1>, %arg1: tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x256x14x14xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x256x14x14xi1>, tensor<4x256x14x14xf16>) outs(%0 : tensor<4x256x14x14xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %2 = arith.select %in, %in_0, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<4x256x14x14xf16> - return %1 : tensor<4x256x14x14xf16> - } - func.func private @BatchNormGradOp88(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<256xf32> - %1 = mhlo.convert %arg0 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %2 = mhlo.convert %arg2 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<4x256x14x14xf32>) -> (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x256x14x14xf32>) -> tensor<4x256x14x14xf16> - return %3, %grad_scale, %grad_offset : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - } - func.func private @ConvBackwardDataOp89(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,256,256]{1,0,2,3}"} : (tensor<256x256x3x3xf16>) -> tensor<3x3x256x256xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,256,256]{1,0,2,3}"} : (tensor<3x3x256x256xf16>) -> tensor<3x3x256x256xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 
1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<3x3x256x256xf16>) -> tensor<4x256x14x14xf16> - return %2 : tensor<4x256x14x14xf16> - } - func.func private @ConvBackwardFilterOp90(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x256x14x14xf16>) -> tensor<256x256x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> tensor<3x3x256x256xf16> - %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[256,256,3,3]{0,1,3,2}"} : (tensor<3x3x256x256xf16>) -> tensor<256x256x3x3xf16> - return %1 : tensor<256x256x3x3xf16> - } - func.func private @Unknown91(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x256x14x14xf16>, %arg2: tensor<4x256x14x14xi1>) -> tensor<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x256x14x14xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<4x256x14x14xi1>, tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) outs(%0 : tensor<4x256x14x14xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.select %in, %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<4x256x14x14xf16> + %1 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %0) -> (tensor<4x256x14x14xf16>) { + %2 = scf.for %arg5 = %c0 to %c256 step %c1 iter_args(%arg6 = %arg4) -> (tensor<4x256x14x14xf16>) { + %3 = scf.for %arg7 = %c0 to %c14 step %c1 iter_args(%arg8 = %arg6) -> (tensor<4x256x14x14xf16>) { + %4 = scf.for %arg9 = %c0 to %c14 step %c1 iter_args(%arg10 = %arg8) -> (tensor<4x256x14x14xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x256x14x14xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x256x14x14xf16> to tensor + %extracted_slice_1 = tensor.extract_slice %arg2[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x256x14x14xi1> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0, %extracted_slice_1 : tensor, tensor, tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %in_2: f16, %in_3: i1, %out: f16): + %7 = arith.addf %in, %in_2 : f16 + %8 = arith.select %in_3, %7, %cst : f16 + linalg.yield %8 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg10[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x256x14x14xf16> + scf.yield %inserted_slice : tensor<4x256x14x14xf16> + } + scf.yield %4 : tensor<4x256x14x14xf16> + } + 
scf.yield %3 : tensor<4x256x14x14xf16> + } + scf.yield %2 : tensor<4x256x14x14xf16> + } return %1 : tensor<4x256x14x14xf16> } - func.func private @BatchNormGradOp92(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { + func.func private @BatchNormGradOp90(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { %0 = mhlo.constant dense<0.000000e+00> : tensor<256xf32> %1 = mhlo.convert %arg0 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> %2 = mhlo.convert %arg2 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> @@ -813,135 +1641,103 @@ module @IrToMhlo.2452 { %3 = mhlo.convert %grad_operand : (tensor<4x256x14x14xf32>) -> tensor<4x256x14x14xf16> return %3, %grad_scale, %grad_offset : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> } - func.func private @ConvBackwardDataOp93(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { + func.func private @ConvBackwardDataOp91(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,256,256]{1,0,2,3}"} : (tensor<256x256x3x3xf16>) -> tensor<3x3x256x256xf16> %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,256,256]{1,0,2,3}"} : (tensor<3x3x256x256xf16>) -> tensor<3x3x256x256xf16> %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<3x3x256x256xf16>) -> tensor<4x256x14x14xf16> return %2 : tensor<4x256x14x14xf16> } - func.func private @ConvBackwardFilterOp94(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x256x14x14xf16>) -> tensor<256x256x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { + func.func private @ConvBackwardFilterOp92(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x256x14x14xf16>) -> tensor<256x256x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, 
__byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> tensor<3x3x256x256xf16> %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[256,256,3,3]{0,1,3,2}"} : (tensor<3x3x256x256xf16>) -> tensor<256x256x3x3xf16> return %1 : tensor<256x256x3x3xf16> } - func.func private @Unknown95(%arg0: tensor<4x256x14x14xi1>, %arg1: tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown93(%arg0: tensor<4x256x14x14xi1>, %arg1: tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { + %c14 = arith.constant 14 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x256x14x14xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x256x14x14xi1>, tensor<4x256x14x14xf16>) outs(%0 : tensor<4x256x14x14xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %2 = arith.select %in, %in_0, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<4x256x14x14xf16> + %1 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %0) -> (tensor<4x256x14x14xf16>) { + %2 = scf.for %arg4 = %c0 to %c256 step %c1 iter_args(%arg5 = %arg3) -> (tensor<4x256x14x14xf16>) { + %3 = scf.for %arg6 = %c0 to %c14 step %c1 iter_args(%arg7 = %arg5) -> (tensor<4x256x14x14xf16>) { + %4 = scf.for %arg8 = %c0 to %c14 step %c1 iter_args(%arg9 = %arg7) -> (tensor<4x256x14x14xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x256x14x14xi1> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x256x14x14xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%5 : tensor) { + ^bb0(%in: i1, %in_1: f16, %out: f16): + %7 = arith.select %in, %in_1, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg9[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x256x14x14xf16> + scf.yield %inserted_slice : tensor<4x256x14x14xf16> + } + scf.yield %4 : tensor<4x256x14x14xf16> + } + scf.yield %3 : tensor<4x256x14x14xf16> + } + scf.yield %2 : tensor<4x256x14x14xf16> + } return %1 : tensor<4x256x14x14xf16> } - func.func private @BatchNormGradOp96(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<256xf32> - %1 = mhlo.convert %arg0 : 
(tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %2 = mhlo.convert %arg2 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<4x256x14x14xf32>) -> (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x256x14x14xf32>) -> tensor<4x256x14x14xf16> - return %3, %grad_scale, %grad_offset : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - } - func.func private @ConvBackwardDataOp97(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256x128x3x3xf16>) -> tensor<4x128x28x28xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { + func.func private @ConvBackwardDataOp103(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256x128x3x3xf16>) -> tensor<4x128x28x28xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,128,256]{1,0,2,3}"} : (tensor<256x128x3x3xf16>) -> tensor<3x3x128x256xf16> %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,128,256]{1,0,2,3}"} : (tensor<3x3x128x256xf16>) -> tensor<3x3x128x256xf16> %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 2], [1, 2]], lhs_dilate = [2, 2], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<3x3x128x256xf16>) -> tensor<4x128x28x28xf16> return %2 : tensor<4x128x28x28xf16> } - func.func private @ConvBackwardFilterOp98(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x256x14x14xf16>) -> tensor<256x128x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { + func.func private @ConvBackwardFilterOp104(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x256x14x14xf16>) -> tensor<256x128x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 0], [1, 0]], lhs_dilate = [1, 1], rhs_dilate = [2, 2]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<4x256x14x14xf16>) -> 
tensor<3x3x128x256xf16> %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[256,128,3,3]{0,1,3,2}"} : (tensor<3x3x128x256xf16>) -> tensor<256x128x3x3xf16> return %1 : tensor<256x128x3x3xf16> } - func.func private @BatchNormGradOp99(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256xf32>, %arg2: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<256xf32> - %1 = mhlo.convert %arg0 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %2 = mhlo.convert %arg2 : (tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<4x256x14x14xf32>) -> (tensor<4x256x14x14xf32>, tensor<256xf32>, tensor<256xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x256x14x14xf32>) -> tensor<4x256x14x14xf16> - return %3, %grad_scale, %grad_offset : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - } - func.func private @ConvBackwardDataOp100(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256x128x1x1xf16>) -> tensor<4x128x28x28xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { + func.func private @ConvBackwardDataOp106(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<256x128x1x1xf16>) -> tensor<4x128x28x28xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[1,1,128,256]{1,0,2,3}"} : (tensor<256x128x1x1xf16>) -> tensor<1x1x128x256xf16> %1 = mhlo.convolution(%arg0, %0) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 1], [0, 1]], lhs_dilate = [2, 2], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<1x1x128x256xf16>) -> tensor<4x128x28x28xf16> return %1 : tensor<4x128x28x28xf16> } - func.func private @ConvBackwardFilterOp101(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x256x14x14xf16>) -> tensor<256x128x1x1xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { + func.func private @ConvBackwardFilterOp107(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x256x14x14xf16>) -> tensor<256x128x1x1xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", 
__byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[0, -1], [0, -1]], lhs_dilate = [1, 1], rhs_dilate = [2, 2]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<4x256x14x14xf16>) -> tensor<1x1x128x256xf16> %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[256,128,1,1]{0,1,3,2}"} : (tensor<1x1x128x256xf16>) -> tensor<256x128x1x1xf16> return %1 : tensor<256x128x1x1xf16> } - func.func private @Unknown102(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>, %arg2: tensor<4x128x28x28xi1>) -> tensor<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown108(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>, %arg2: tensor<4x128x28x28xi1>) -> tensor<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { + %c28 = arith.constant 28 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x128x28x28xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<4x128x28x28xi1>, tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) outs(%0 : tensor<4x128x28x28xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.select %in, %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<4x128x28x28xf16> - return %1 : tensor<4x128x28x28xf16> - } - func.func private @BatchNormGradOp103(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<128xf32> - %1 = mhlo.convert %arg0 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %2 = mhlo.convert %arg2 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<4x128x28x28xf32>) -> (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x128x28x28xf32>) -> tensor<4x128x28x28xf16> - return %3, %grad_scale, %grad_offset : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - } - func.func private @ConvBackwardDataOp104(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,128,128]{1,0,2,3}"} : (tensor<128x128x3x3xf16>) -> tensor<3x3x128x128xf16> - %1 = 
"mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,128,128]{1,0,2,3}"} : (tensor<3x3x128x128xf16>) -> tensor<3x3x128x128xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<3x3x128x128xf16>) -> tensor<4x128x28x28xf16> - return %2 : tensor<4x128x28x28xf16> - } - func.func private @ConvBackwardFilterOp105(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>) -> tensor<128x128x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> tensor<3x3x128x128xf16> - %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[128,128,3,3]{0,1,3,2}"} : (tensor<3x3x128x128xf16>) -> tensor<128x128x3x3xf16> - return %1 : tensor<128x128x3x3xf16> - } - func.func private @Unknown106(%arg0: tensor<4x128x28x28xi1>, %arg1: tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x128x28x28xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x128x28x28xi1>, tensor<4x128x28x28xf16>) outs(%0 : tensor<4x128x28x28xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %2 = arith.select %in, %in_0, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<4x128x28x28xf16> - return %1 : tensor<4x128x28x28xf16> - } - func.func private @BatchNormGradOp107(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<128xf32> - %1 = mhlo.convert %arg0 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %2 = mhlo.convert %arg2 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<4x128x28x28xf32>) -> (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x128x28x28xf32>) -> tensor<4x128x28x28xf16> - return %3, %grad_scale, %grad_offset : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - } - func.func private @ConvBackwardDataOp108(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, 
__byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,128,128]{1,0,2,3}"} : (tensor<128x128x3x3xf16>) -> tensor<3x3x128x128xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,128,128]{1,0,2,3}"} : (tensor<3x3x128x128xf16>) -> tensor<3x3x128x128xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<3x3x128x128xf16>) -> tensor<4x128x28x28xf16> - return %2 : tensor<4x128x28x28xf16> - } - func.func private @ConvBackwardFilterOp109(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>) -> tensor<128x128x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> tensor<3x3x128x128xf16> - %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[128,128,3,3]{0,1,3,2}"} : (tensor<3x3x128x128xf16>) -> tensor<128x128x3x3xf16> - return %1 : tensor<128x128x3x3xf16> - } - func.func private @Unknown110(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>, %arg2: tensor<4x128x28x28xi1>) -> tensor<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x128x28x28xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<4x128x28x28xi1>, tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) outs(%0 : tensor<4x128x28x28xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.select %in, %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<4x128x28x28xf16> + %1 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %0) -> (tensor<4x128x28x28xf16>) { + %2 = scf.for %arg5 = %c0 to %c128 step %c1 iter_args(%arg6 = %arg4) -> (tensor<4x128x28x28xf16>) { + %3 = scf.for %arg7 = %c0 to %c28 step %c1 iter_args(%arg8 = %arg6) -> (tensor<4x128x28x28xf16>) { + %4 = scf.for %arg9 = %c0 to %c28 step %c1 iter_args(%arg10 = %arg8) -> (tensor<4x128x28x28xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x128x28x28xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x128x28x28xf16> to tensor + %extracted_slice_1 = tensor.extract_slice %arg2[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : 
tensor<4x128x28x28xi1> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0, %extracted_slice_1 : tensor, tensor, tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %in_2: f16, %in_3: i1, %out: f16): + %7 = arith.addf %in, %in_2 : f16 + %8 = arith.select %in_3, %7, %cst : f16 + linalg.yield %8 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg10[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x128x28x28xf16> + scf.yield %inserted_slice : tensor<4x128x28x28xf16> + } + scf.yield %4 : tensor<4x128x28x28xf16> + } + scf.yield %3 : tensor<4x128x28x28xf16> + } + scf.yield %2 : tensor<4x128x28x28xf16> + } return %1 : tensor<4x128x28x28xf16> } - func.func private @BatchNormGradOp111(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { + func.func private @BatchNormGradOp109(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { %0 = mhlo.constant dense<0.000000e+00> : tensor<128xf32> %1 = mhlo.convert %arg0 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> %2 = mhlo.convert %arg2 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> @@ -949,135 +1745,103 @@ module @IrToMhlo.2452 { %3 = mhlo.convert %grad_operand : (tensor<4x128x28x28xf32>) -> tensor<4x128x28x28xf16> return %3, %grad_scale, %grad_offset : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> } - func.func private @ConvBackwardDataOp112(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { + func.func private @ConvBackwardDataOp110(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,128,128]{1,0,2,3}"} : (tensor<128x128x3x3xf16>) -> tensor<3x3x128x128xf16> %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,128,128]{1,0,2,3}"} : (tensor<3x3x128x128xf16>) -> tensor<3x3x128x128xf16> %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<3x3x128x128xf16>) -> tensor<4x128x28x28xf16> return %2 : tensor<4x128x28x28xf16> } - func.func private 
@ConvBackwardFilterOp113(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>) -> tensor<128x128x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { + func.func private @ConvBackwardFilterOp111(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>) -> tensor<128x128x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> tensor<3x3x128x128xf16> %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[128,128,3,3]{0,1,3,2}"} : (tensor<3x3x128x128xf16>) -> tensor<128x128x3x3xf16> return %1 : tensor<128x128x3x3xf16> } - func.func private @Unknown114(%arg0: tensor<4x128x28x28xi1>, %arg1: tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown112(%arg0: tensor<4x128x28x28xi1>, %arg1: tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { + %c28 = arith.constant 28 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x128x28x28xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x128x28x28xi1>, tensor<4x128x28x28xf16>) outs(%0 : tensor<4x128x28x28xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %2 = arith.select %in, %in_0, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<4x128x28x28xf16> + %1 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %0) -> (tensor<4x128x28x28xf16>) { + %2 = scf.for %arg4 = %c0 to %c128 step %c1 iter_args(%arg5 = %arg3) -> (tensor<4x128x28x28xf16>) { + %3 = scf.for %arg6 = %c0 to %c28 step %c1 iter_args(%arg7 = %arg5) -> (tensor<4x128x28x28xf16>) { + %4 = scf.for %arg8 = %c0 to %c28 step %c1 iter_args(%arg9 = %arg7) -> (tensor<4x128x28x28xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x128x28x28xi1> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x128x28x28xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%5 : tensor) { + ^bb0(%in: i1, %in_1: f16, %out: f16): + %7 = arith.select %in, %in_1, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg9[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into 
tensor<4x128x28x28xf16> + scf.yield %inserted_slice : tensor<4x128x28x28xf16> + } + scf.yield %4 : tensor<4x128x28x28xf16> + } + scf.yield %3 : tensor<4x128x28x28xf16> + } + scf.yield %2 : tensor<4x128x28x28xf16> + } return %1 : tensor<4x128x28x28xf16> } - func.func private @BatchNormGradOp115(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<128xf32> - %1 = mhlo.convert %arg0 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %2 = mhlo.convert %arg2 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<4x128x28x28xf32>) -> (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x128x28x28xf32>) -> tensor<4x128x28x28xf16> - return %3, %grad_scale, %grad_offset : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - } - func.func private @ConvBackwardDataOp116(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128x64x3x3xf16>) -> tensor<4x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { + func.func private @ConvBackwardDataOp122(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128x64x3x3xf16>) -> tensor<4x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,64,128]{1,0,2,3}"} : (tensor<128x64x3x3xf16>) -> tensor<3x3x64x128xf16> %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,64,128]{1,0,2,3}"} : (tensor<3x3x64x128xf16>) -> tensor<3x3x64x128xf16> %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 2], [1, 2]], lhs_dilate = [2, 2], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<3x3x64x128xf16>) -> tensor<4x64x56x56xf16> return %2 : tensor<4x64x56x56xf16> } - func.func private @ConvBackwardFilterOp117(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x128x28x28xf16>) -> tensor<128x64x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { + func.func private @ConvBackwardFilterOp123(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x128x28x28xf16>) -> tensor<128x64x3x3xf16> attributes 
{__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 0], [1, 0]], lhs_dilate = [1, 1], rhs_dilate = [2, 2]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<4x128x28x28xf16>) -> tensor<3x3x64x128xf16> %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[128,64,3,3]{0,1,3,2}"} : (tensor<3x3x64x128xf16>) -> tensor<128x64x3x3xf16> return %1 : tensor<128x64x3x3xf16> } - func.func private @BatchNormGradOp118(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128xf32>, %arg2: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<128xf32> - %1 = mhlo.convert %arg0 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %2 = mhlo.convert %arg2 : (tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<4x128x28x28xf32>) -> (tensor<4x128x28x28xf32>, tensor<128xf32>, tensor<128xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x128x28x28xf32>) -> tensor<4x128x28x28xf16> - return %3, %grad_scale, %grad_offset : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - } - func.func private @ConvBackwardDataOp119(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128x64x1x1xf16>) -> tensor<4x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { + func.func private @ConvBackwardDataOp125(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<128x64x1x1xf16>) -> tensor<4x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[1,1,64,128]{1,0,2,3}"} : (tensor<128x64x1x1xf16>) -> tensor<1x1x64x128xf16> %1 = mhlo.convolution(%arg0, %0) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[0, 1], [0, 1]], lhs_dilate = [2, 2], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<1x1x64x128xf16>) -> tensor<4x64x56x56xf16> return %1 : tensor<4x64x56x56xf16> } - func.func private @ConvBackwardFilterOp120(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x128x28x28xf16>) -> tensor<128x64x1x1xf16> attributes 
{__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { + func.func private @ConvBackwardFilterOp126(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x128x28x28xf16>) -> tensor<128x64x1x1xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<0> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[0, -1], [0, -1]], lhs_dilate = [1, 1], rhs_dilate = [2, 2]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<4x128x28x28xf16>) -> tensor<1x1x64x128xf16> %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[128,64,1,1]{0,1,3,2}"} : (tensor<1x1x64x128xf16>) -> tensor<128x64x1x1xf16> return %1 : tensor<128x64x1x1xf16> } - func.func private @Unknown121(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>, %arg2: tensor<4x64x56x56xi1>) -> tensor<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown127(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>, %arg2: tensor<4x64x56x56xi1>) -> tensor<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + %c56 = arith.constant 56 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<4x64x56x56xi1>, tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) outs(%0 : tensor<4x64x56x56xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.select %in, %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<4x64x56x56xf16> - return %1 : tensor<4x64x56x56xf16> - } - func.func private @BatchNormGradOp122(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<64xf32> - %1 = mhlo.convert %arg0 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> - %2 = mhlo.convert %arg2 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<4x64x56x56xf32>) -> (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x64x56x56xf32>) -> tensor<4x64x56x56xf16> - return %3, %grad_scale, %grad_offset : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> - } - func.func private @ConvBackwardDataOp123(%arg0: 
tensor<4x64x56x56xf16>, %arg1: tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,64,64]{1,0,2,3}"} : (tensor<64x64x3x3xf16>) -> tensor<3x3x64x64xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,64,64]{1,0,2,3}"} : (tensor<3x3x64x64xf16>) -> tensor<3x3x64x64xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<3x3x64x64xf16>) -> tensor<4x64x56x56xf16> - return %2 : tensor<4x64x56x56xf16> - } - func.func private @ConvBackwardFilterOp124(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<3x3x64x64xf16> - %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[64,64,3,3]{0,1,3,2}"} : (tensor<3x3x64x64xf16>) -> tensor<64x64x3x3xf16> - return %1 : tensor<64x64x3x3xf16> - } - func.func private @Unknown125(%arg0: tensor<4x64x56x56xi1>, %arg1: tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x64x56x56xi1>, tensor<4x64x56x56xf16>) outs(%0 : tensor<4x64x56x56xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %2 = arith.select %in, %in_0, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<4x64x56x56xf16> - return %1 : tensor<4x64x56x56xf16> - } - func.func private @BatchNormGradOp126(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<64xf32> - %1 = mhlo.convert %arg0 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> - %2 = mhlo.convert %arg2 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>, 
tensor<64xf32>, tensor<4x64x56x56xf32>) -> (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x64x56x56xf32>) -> tensor<4x64x56x56xf16> - return %3, %grad_scale, %grad_offset : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> - } - func.func private @ConvBackwardDataOp127(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,64,64]{1,0,2,3}"} : (tensor<64x64x3x3xf16>) -> tensor<3x3x64x64xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,64,64]{1,0,2,3}"} : (tensor<3x3x64x64xf16>) -> tensor<3x3x64x64xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<3x3x64x64xf16>) -> tensor<4x64x56x56xf16> - return %2 : tensor<4x64x56x56xf16> - } - func.func private @ConvBackwardFilterOp128(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<3x3x64x64xf16> - %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[64,64,3,3]{0,1,3,2}"} : (tensor<3x3x64x64xf16>) -> tensor<64x64x3x3xf16> - return %1 : tensor<64x64x3x3xf16> - } - func.func private @Unknown129(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>, %arg2: tensor<4x64x56x56xi1>) -> tensor<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<4x64x56x56xi1>, tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) outs(%0 : tensor<4x64x56x56xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.select %in, %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<4x64x56x56xf16> + %1 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %0) -> (tensor<4x64x56x56xf16>) { + %2 = scf.for %arg5 = %c0 to %c64 step %c1 iter_args(%arg6 = %arg4) -> (tensor<4x64x56x56xf16>) { + %3 = scf.for %arg7 = %c0 to %c56 step %c1 iter_args(%arg8 = %arg6) -> (tensor<4x64x56x56xf16>) { + %4 = scf.for %arg9 = %c0 to %c56 
step %c1 iter_args(%arg10 = %arg8) -> (tensor<4x64x56x56xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x56x56xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x56x56xf16> to tensor + %extracted_slice_1 = tensor.extract_slice %arg2[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x56x56xi1> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0, %extracted_slice_1 : tensor, tensor, tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %in_2: f16, %in_3: i1, %out: f16): + %7 = arith.addf %in, %in_2 : f16 + %8 = arith.select %in_3, %7, %cst : f16 + linalg.yield %8 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg10[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x64x56x56xf16> + scf.yield %inserted_slice : tensor<4x64x56x56xf16> + } + scf.yield %4 : tensor<4x64x56x56xf16> + } + scf.yield %3 : tensor<4x64x56x56xf16> + } + scf.yield %2 : tensor<4x64x56x56xf16> + } return %1 : tensor<4x64x56x56xf16> } - func.func private @BatchNormGradOp130(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { + func.func private @BatchNormGradOp128(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { %0 = mhlo.constant dense<0.000000e+00> : tensor<64xf32> %1 = mhlo.convert %arg0 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> %2 = mhlo.convert %arg2 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> @@ -1085,66 +1849,110 @@ module @IrToMhlo.2452 { %3 = mhlo.convert %grad_operand : (tensor<4x64x56x56xf32>) -> tensor<4x64x56x56xf16> return %3, %grad_scale, %grad_offset : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> } - func.func private @ConvBackwardDataOp131(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { + func.func private @ConvBackwardDataOp129(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,64,64]{1,0,2,3}"} : (tensor<64x64x3x3xf16>) -> tensor<3x3x64x64xf16> %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,64,64]{1,0,2,3}"} : (tensor<3x3x64x64xf16>) -> tensor<3x3x64x64xf16> %2 = 
mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<3x3x64x64xf16>) -> tensor<4x64x56x56xf16> return %2 : tensor<4x64x56x56xf16> } - func.func private @ConvBackwardFilterOp132(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { + func.func private @ConvBackwardFilterOp130(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<3x3x64x64xf16> %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[64,64,3,3]{0,1,3,2}"} : (tensor<3x3x64x64xf16>) -> tensor<64x64x3x3xf16> return %1 : tensor<64x64x3x3xf16> } - func.func private @Unknown133(%arg0: tensor<4x64x56x56xi1>, %arg1: tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown131(%arg0: tensor<4x64x56x56xi1>, %arg1: tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + %c56 = arith.constant 56 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x64x56x56xi1>, tensor<4x64x56x56xf16>) outs(%0 : tensor<4x64x56x56xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %2 = arith.select %in, %in_0, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<4x64x56x56xf16> + %1 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %0) -> (tensor<4x64x56x56xf16>) { + %2 = scf.for %arg4 = %c0 to %c64 step %c1 iter_args(%arg5 = %arg3) -> (tensor<4x64x56x56xf16>) { + %3 = scf.for %arg6 = %c0 to %c56 step %c1 iter_args(%arg7 = %arg5) -> (tensor<4x64x56x56xf16>) { + %4 = scf.for %arg8 = %c0 to %c56 step %c1 iter_args(%arg9 = %arg7) -> (tensor<4x64x56x56xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x56x56xi1> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x56x56xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map, 
#map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%5 : tensor) { + ^bb0(%in: i1, %in_1: f16, %out: f16): + %7 = arith.select %in, %in_1, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg9[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x64x56x56xf16> + scf.yield %inserted_slice : tensor<4x64x56x56xf16> + } + scf.yield %4 : tensor<4x64x56x56xf16> + } + scf.yield %3 : tensor<4x64x56x56xf16> + } + scf.yield %2 : tensor<4x64x56x56xf16> + } return %1 : tensor<4x64x56x56xf16> } - func.func private @BatchNormGradOp134(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64xf32>, %arg2: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { - %0 = mhlo.constant dense<0.000000e+00> : tensor<64xf32> - %1 = mhlo.convert %arg0 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> - %2 = mhlo.convert %arg2 : (tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf32> - %grad_operand, %grad_scale, %grad_offset = "mhlo.batch_norm_grad"(%1, %arg1, %0, %0, %2) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} : (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<4x64x56x56xf32>) -> (tensor<4x64x56x56xf32>, tensor<64xf32>, tensor<64xf32>) - %3 = mhlo.convert %grad_operand : (tensor<4x64x56x56xf32>) -> tensor<4x64x56x56xf16> - return %3, %grad_scale, %grad_offset : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> - } - func.func private @ConvBackwardDataOp135(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardDataOp"} { - %0 = "mhlo.transpose"(%arg1) {permutation = dense<[2, 3, 1, 0]> : tensor<4xi64>, xla_shape = "f16[3,3,64,64]{1,0,2,3}"} : (tensor<64x64x3x3xf16>) -> tensor<3x3x64x64xf16> - %1 = "mhlo.reverse"(%0) {dimensions = dense<[0, 1]> : tensor<2xi64>, xla_shape = "f16[3,3,64,64]{1,0,2,3}"} : (tensor<3x3x64x64xf16>) -> tensor<3x3x64x64xf16> - %2 = mhlo.convolution(%arg0, %1) dim_numbers = [b, f, 0, 1]x[0, 1, o, i]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<3x3x64x64xf16>) -> tensor<4x64x56x56xf16> - return %2 : tensor<4x64x56x56xf16> - } - func.func private @ConvBackwardFilterOp136(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<1> : tensor<4xi64>, __byre__window_strides = dense<1> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { - %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : 
(tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<3x3x64x64xf16> - %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[64,64,3,3]{0,1,3,2}"} : (tensor<3x3x64x64xf16>) -> tensor<64x64x3x3xf16> - return %1 : tensor<64x64x3x3xf16> - } - func.func private @Unknown137(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown143(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + %c56 = arith.constant 56 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<4x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) outs(%0 : tensor<4x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.addf %in, %in_0 : f16 - linalg.yield %2 : f16 - } -> tensor<4x64x56x56xf16> + %1 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %0) -> (tensor<4x64x56x56xf16>) { + %2 = scf.for %arg4 = %c0 to %c64 step %c1 iter_args(%arg5 = %arg3) -> (tensor<4x64x56x56xf16>) { + %3 = scf.for %arg6 = %c0 to %c56 step %c1 iter_args(%arg7 = %arg5) -> (tensor<4x64x56x56xf16>) { + %4 = scf.for %arg8 = %c0 to %c56 step %c1 iter_args(%arg9 = %arg7) -> (tensor<4x64x56x56xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x56x56xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x56x56xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + %7 = arith.addf %in, %in_1 : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg9[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x64x56x56xf16> + scf.yield %inserted_slice : tensor<4x64x56x56xf16> + } + scf.yield %4 : tensor<4x64x56x56xf16> + } + scf.yield %3 : tensor<4x64x56x56xf16> + } + scf.yield %2 : tensor<4x64x56x56xf16> + } return %1 : tensor<4x64x56x56xf16> } - func.func private @Unknown138(%arg0: tensor<4x64x112x112xi1>, %arg1: tensor<4x64x112x112xf16>) -> tensor<4x64x112x112xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown144(%arg0: tensor<4x64x112x112xi1>, %arg1: tensor<4x64x112x112xf16>) -> tensor<4x64x112x112xf16> attributes {__byteir_elementwise_fusion__} { + %c112 = arith.constant 112 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x64x112x112xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x64x112x112xi1>, tensor<4x64x112x112xf16>) outs(%0 : tensor<4x64x112x112xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %2 = arith.select %in, %in_0, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<4x64x112x112xf16> + %1 = scf.for %arg2 = %c0 to %c4 step %c1 
iter_args(%arg3 = %0) -> (tensor<4x64x112x112xf16>) { + %2 = scf.for %arg4 = %c0 to %c64 step %c1 iter_args(%arg5 = %arg3) -> (tensor<4x64x112x112xf16>) { + %3 = scf.for %arg6 = %c0 to %c112 step %c1 iter_args(%arg7 = %arg5) -> (tensor<4x64x112x112xf16>) { + %4 = scf.for %arg8 = %c0 to %c112 step %c1 iter_args(%arg9 = %arg7) -> (tensor<4x64x112x112xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x112x112xi1> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x112x112xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%5 : tensor) { + ^bb0(%in: i1, %in_1: f16, %out: f16): + %7 = arith.select %in, %in_1, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg9[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x64x112x112xf16> + scf.yield %inserted_slice : tensor<4x64x112x112xf16> + } + scf.yield %4 : tensor<4x64x112x112xf16> + } + scf.yield %3 : tensor<4x64x112x112xf16> + } + scf.yield %2 : tensor<4x64x112x112xf16> + } return %1 : tensor<4x64x112x112xf16> } - func.func private @BatchNormGradOp139(%arg0: tensor<4x64x112x112xf16>, %arg1: tensor<64xf32>, %arg2: tensor<4x64x112x112xf16>) -> (tensor<4x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { + func.func private @BatchNormGradOp145(%arg0: tensor<4x64x112x112xf16>, %arg1: tensor<64xf32>, %arg2: tensor<4x64x112x112xf16>) -> (tensor<4x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) attributes {__byre__epsilon = 9.99999974E-6 : f32, __byre__feature_index = 1 : i64, byre_compute_name = "BatchNormGradOp"} { %0 = mhlo.constant dense<0.000000e+00> : tensor<64xf32> %1 = mhlo.convert %arg0 : (tensor<4x64x112x112xf16>) -> tensor<4x64x112x112xf32> %2 = mhlo.convert %arg2 : (tensor<4x64x112x112xf16>) -> tensor<4x64x112x112xf32> @@ -1152,15 +1960,246 @@ module @IrToMhlo.2452 { %3 = mhlo.convert %grad_operand : (tensor<4x64x112x112xf32>) -> tensor<4x64x112x112xf16> return %3, %grad_scale, %grad_offset : tensor<4x64x112x112xf16>, tensor<64xf32>, tensor<64xf32> } - func.func private @ConvBackwardFilterOp140(%arg0: tensor<4x3x224x224xf16>, %arg1: tensor<4x64x112x112xf16>) -> tensor<64x3x7x7xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<3> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { + func.func private @ConvBackwardFilterOp146(%arg0: tensor<4x3x224x224xf16>, %arg1: tensor<4x64x112x112xf16>) -> tensor<64x3x7x7xf16> attributes {__byre__batch_group_count = 1 : i64, __byre__feature_group_count = 1 : i64, __byre__input_layout = "NCHW", __byre__kernel_layout = "NCHW", __byre__output_layout = "NCHW", __byre__padding = dense<3> : tensor<4xi64>, __byre__window_strides = dense<2> : tensor<2xi64>, byre_compute_name = "ConvBackwardFilterOp"} { %0 = mhlo.convolution(%arg0, %arg1) dim_numbers = [f, b, 0, 1]x[i, o, 0, 1]->[0, 1, b, f], window = {stride = [1, 1], pad = [[3, 2], [3, 2]], lhs_dilate = [1, 1], rhs_dilate = [2, 2]} {batch_group_count = 1 : i64, 
feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x3x224x224xf16>, tensor<4x64x112x112xf16>) -> tensor<7x7x3x64xf16> %1 = "mhlo.transpose"(%0) {permutation = dense<[3, 2, 0, 1]> : tensor<4xi64>, xla_shape = "f16[64,3,7,7]{0,1,3,2}"} : (tensor<7x7x3x64xf16>) -> tensor<64x3x7x7xf16> return %1 : tensor<64x3x7x7xf16> } - func.func private @Unknown141(%arg0: tensor) -> tensor attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown147(%arg0: tensor<4x1000xf16>, %arg1: tensor<4x1000xf32>) -> tensor attributes {__byteir_reduction_fusion__} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 0.000000e+00 : f16 + %cst_0 = arith.constant 0.000000e+00 : f32 + %0 = tensor.empty() : tensor + %collapsed = tensor.collapse_shape %arg0 [[0, 1]] : tensor<4x1000xf16> into tensor<4000xf16> + %collapsed_1 = tensor.collapse_shape %arg1 [[0, 1]] : tensor<4x1000xf32> into tensor<4000xf32> + %expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<4000xf16> into tensor<32x125xf16> + %expanded_2 = tensor.expand_shape %collapsed_1 [[0, 1]] : tensor<4000xf32> into tensor<32x125xf32> + %1 = tensor.empty() : tensor<32xf32> + %2 = scf.forall (%arg2) in (32) shared_outs(%arg3 = %1) -> (tensor<32xf32>) { + %extracted_slice = tensor.extract_slice %expanded[%arg2, 0] [1, 125] [1, 1] : tensor<32x125xf16> to tensor<125xf16> + %expanded_3 = tensor.expand_shape %extracted_slice [[0, 1]] : tensor<125xf16> into tensor<1x125xf16> + %extracted_slice_4 = tensor.extract_slice %expanded_2[%arg2, 0] [1, 125] [1, 1] : tensor<32x125xf32> to tensor<125xf32> + %expanded_5 = tensor.expand_shape %extracted_slice_4 [[0, 1]] : tensor<125xf32> into tensor<1x125xf32> + %extracted_slice_6 = tensor.extract_slice %arg3[%arg2] [1] [1] : tensor<32xf32> to tensor + %4 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<128xf32> + %5 = scf.forall (%arg4) in (128) shared_outs(%arg5 = %4) -> (tensor<128xf32>) { + %19 = affine.min #map8(%arg4) + %20 = affine.min #map9(%arg4) + %21 = affine.apply #map3(%20, %19) + %extracted_slice_13 = tensor.extract_slice %expanded_3[0, %19] [1, %21] [1, 1] : tensor<1x125xf16> to tensor + %expanded_14 = tensor.expand_shape %extracted_slice_13 [[0, 1]] : tensor into tensor<1x?xf16> + %extracted_slice_15 = tensor.extract_slice %expanded_5[0, %19] [1, %21] [1, 1] : tensor<1x125xf32> to tensor + %expanded_16 = tensor.expand_shape %extracted_slice_15 [[0, 1]] : tensor into tensor<1x?xf32> + %dim = tensor.dim %expanded_14, %c1 : tensor<1x?xf16> + %22 = arith.cmpi ugt, %dim, %c0 : index + %23 = scf.if %22 -> (f16) { + %extracted = tensor.extract %expanded_14[%c0, %c0] : tensor<1x?xf16> + scf.yield %extracted : f16 + } else { + scf.yield %cst : f16 + } + %dim_17 = tensor.dim %expanded_16, %c1 : tensor<1x?xf32> + %24 = arith.cmpi ugt, %dim_17, %c0 : index + %25 = scf.if %24 -> (f32) { + %extracted = tensor.extract %expanded_16[%c0, %c0] : tensor<1x?xf32> + scf.yield %extracted : f32 + } else { + scf.yield %cst_0 : f32 + } + %26 = arith.extf %23 : f16 to f32 + %27 = arith.mulf %26, %25 : f32 + %28 = arith.addf %27, %cst_0 : f32 + %extracted_slice_18 = tensor.extract_slice %arg5[%arg4] [1] [1] : tensor<128xf32> to tensor + %inserted = tensor.insert %28 into %extracted_slice_18[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg4] [1] [1] : tensor into tensor<128xf32> + } + } {mapping = [#gpu.thread]} + %expanded_7 = tensor.expand_shape %5 [[0, 1]] : tensor<128xf32> into 
tensor<64x2xf32> + %6 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<64xf32> + %7 = scf.forall (%arg4) in (64) shared_outs(%arg5 = %6) -> (tensor<64xf32>) { + %extracted = tensor.extract %expanded_7[%arg4, %c0] : tensor<64x2xf32> + %19 = arith.addf %extracted, %cst_0 : f32 + %extracted_13 = tensor.extract %expanded_7[%arg4, %c1] : tensor<64x2xf32> + %20 = arith.addf %extracted_13, %19 : f32 + %extracted_slice_14 = tensor.extract_slice %arg5[%arg4] [1] [1] : tensor<64xf32> to tensor + %inserted = tensor.insert %20 into %extracted_slice_14[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg4] [1] [1] : tensor into tensor<64xf32> + } + } {mapping = [#gpu.thread]} + %expanded_8 = tensor.expand_shape %7 [[0, 1]] : tensor<64xf32> into tensor<32x2xf32> + %8 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<32xf32> + %9 = scf.forall (%arg4) in (32) shared_outs(%arg5 = %8) -> (tensor<32xf32>) { + %extracted = tensor.extract %expanded_8[%arg4, %c0] : tensor<32x2xf32> + %19 = arith.addf %extracted, %cst_0 : f32 + %extracted_13 = tensor.extract %expanded_8[%arg4, %c1] : tensor<32x2xf32> + %20 = arith.addf %extracted_13, %19 : f32 + %extracted_slice_14 = tensor.extract_slice %arg5[%arg4] [1] [1] : tensor<32xf32> to tensor + %inserted = tensor.insert %20 into %extracted_slice_14[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg4] [1] [1] : tensor into tensor<32xf32> + } + } {mapping = [#gpu.thread]} + %expanded_9 = tensor.expand_shape %9 [[0, 1]] : tensor<32xf32> into tensor<16x2xf32> + %10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<16xf32> + %11 = scf.forall (%arg4) in (16) shared_outs(%arg5 = %10) -> (tensor<16xf32>) { + %extracted = tensor.extract %expanded_9[%arg4, %c0] : tensor<16x2xf32> + %19 = arith.addf %extracted, %cst_0 : f32 + %extracted_13 = tensor.extract %expanded_9[%arg4, %c1] : tensor<16x2xf32> + %20 = arith.addf %extracted_13, %19 : f32 + %extracted_slice_14 = tensor.extract_slice %arg5[%arg4] [1] [1] : tensor<16xf32> to tensor + %inserted = tensor.insert %20 into %extracted_slice_14[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg4] [1] [1] : tensor into tensor<16xf32> + } + } {mapping = [#gpu.thread]} + %expanded_10 = tensor.expand_shape %11 [[0, 1]] : tensor<16xf32> into tensor<8x2xf32> + %12 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<8xf32> + %13 = scf.forall (%arg4) in (8) shared_outs(%arg5 = %12) -> (tensor<8xf32>) { + %extracted = tensor.extract %expanded_10[%arg4, %c0] : tensor<8x2xf32> + %19 = arith.addf %extracted, %cst_0 : f32 + %extracted_13 = tensor.extract %expanded_10[%arg4, %c1] : tensor<8x2xf32> + %20 = arith.addf %extracted_13, %19 : f32 + %extracted_slice_14 = tensor.extract_slice %arg5[%arg4] [1] [1] : tensor<8xf32> to tensor + %inserted = tensor.insert %20 into %extracted_slice_14[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg4] [1] [1] : tensor into tensor<8xf32> + } + } {mapping = [#gpu.thread]} + %expanded_11 = tensor.expand_shape %13 [[0, 1]] : tensor<8xf32> into tensor<4x2xf32> + %14 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<4xf32> + %15 = scf.forall (%arg4) in (4) shared_outs(%arg5 = %14) -> (tensor<4xf32>) { + %extracted = tensor.extract %expanded_11[%arg4, %c0] : tensor<4x2xf32> + %19 = arith.addf %extracted, %cst_0 : f32 + 
%extracted_13 = tensor.extract %expanded_11[%arg4, %c1] : tensor<4x2xf32> + %20 = arith.addf %extracted_13, %19 : f32 + %extracted_slice_14 = tensor.extract_slice %arg5[%arg4] [1] [1] : tensor<4xf32> to tensor + %inserted = tensor.insert %20 into %extracted_slice_14[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg4] [1] [1] : tensor into tensor<4xf32> + } + } {mapping = [#gpu.thread]} + %expanded_12 = tensor.expand_shape %15 [[0, 1]] : tensor<4xf32> into tensor<2x2xf32> + %16 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<2xf32> + %17 = scf.forall (%arg4) in (2) shared_outs(%arg5 = %16) -> (tensor<2xf32>) { + %extracted = tensor.extract %expanded_12[%arg4, %c0] : tensor<2x2xf32> + %19 = arith.addf %extracted, %cst_0 : f32 + %extracted_13 = tensor.extract %expanded_12[%arg4, %c1] : tensor<2x2xf32> + %20 = arith.addf %extracted_13, %19 : f32 + %extracted_slice_14 = tensor.extract_slice %arg5[%arg4] [1] [1] : tensor<2xf32> to tensor + %inserted = tensor.insert %20 into %extracted_slice_14[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg4] [1] [1] : tensor into tensor<2xf32> + } + } {mapping = [#gpu.thread]} + %18 = scf.forall (%arg4) in (1) shared_outs(%arg5 = %extracted_slice_6) -> (tensor) { + %19 = affine.apply #map4(%arg4) + %extracted = tensor.extract %17[%19] : tensor<2xf32> + %20 = arith.addf %extracted, %cst_0 : f32 + %21 = affine.apply #map5(%arg4) + %extracted_13 = tensor.extract %17[%21] : tensor<2xf32> + %22 = arith.addf %extracted_13, %20 : f32 + %extracted_slice_14 = tensor.extract_slice %arg5[] [] [] : tensor to tensor + %inserted = tensor.insert %22 into %extracted_slice_14[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[] [] [] : tensor into tensor + } + } {mapping = [#gpu.thread]} + scf.forall.in_parallel { + tensor.parallel_insert_slice %18 into %arg3[%arg2] [1] [1] : tensor into tensor<32xf32> + } + } {mapping = [#gpu.block]} + %3 = scf.forall (%arg2) in (1) shared_outs(%arg3 = %0) -> (tensor) { + %4 = affine.apply #map10(%arg2) + %extracted_slice = tensor.extract_slice %2[%4] [32] [1] : tensor<32xf32> to tensor<32xf32> + %expanded_3 = tensor.expand_shape %extracted_slice [[0, 1]] : tensor<32xf32> into tensor<32x1xf32> + %5 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<32xf32> + %6 = scf.forall (%arg4) in (32) shared_outs(%arg5 = %5) -> (tensor<32xf32>) { + %extracted = tensor.extract %expanded_3[%arg4, %c0] : tensor<32x1xf32> + %16 = arith.addf %extracted, %cst_0 : f32 + %extracted_slice_8 = tensor.extract_slice %arg5[%arg4] [1] [1] : tensor<32xf32> to tensor + %inserted = tensor.insert %16 into %extracted_slice_8[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg4] [1] [1] : tensor into tensor<32xf32> + } + } {mapping = [#gpu.thread]} + %expanded_4 = tensor.expand_shape %6 [[0, 1]] : tensor<32xf32> into tensor<16x2xf32> + %7 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<16xf32> + %8 = scf.forall (%arg4) in (16) shared_outs(%arg5 = %7) -> (tensor<16xf32>) { + %extracted = tensor.extract %expanded_4[%arg4, %c0] : tensor<16x2xf32> + %16 = arith.addf %extracted, %cst_0 : f32 + %extracted_8 = tensor.extract %expanded_4[%arg4, %c1] : tensor<16x2xf32> + %17 = arith.addf %extracted_8, %16 : f32 + %extracted_slice_9 = tensor.extract_slice %arg5[%arg4] [1] [1] : tensor<16xf32> to tensor + %inserted = tensor.insert %17 
into %extracted_slice_9[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg4] [1] [1] : tensor into tensor<16xf32> + } + } {mapping = [#gpu.thread]} + %expanded_5 = tensor.expand_shape %8 [[0, 1]] : tensor<16xf32> into tensor<8x2xf32> + %9 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<8xf32> + %10 = scf.forall (%arg4) in (8) shared_outs(%arg5 = %9) -> (tensor<8xf32>) { + %extracted = tensor.extract %expanded_5[%arg4, %c0] : tensor<8x2xf32> + %16 = arith.addf %extracted, %cst_0 : f32 + %extracted_8 = tensor.extract %expanded_5[%arg4, %c1] : tensor<8x2xf32> + %17 = arith.addf %extracted_8, %16 : f32 + %extracted_slice_9 = tensor.extract_slice %arg5[%arg4] [1] [1] : tensor<8xf32> to tensor + %inserted = tensor.insert %17 into %extracted_slice_9[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg4] [1] [1] : tensor into tensor<8xf32> + } + } {mapping = [#gpu.thread]} + %expanded_6 = tensor.expand_shape %10 [[0, 1]] : tensor<8xf32> into tensor<4x2xf32> + %11 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<4xf32> + %12 = scf.forall (%arg4) in (4) shared_outs(%arg5 = %11) -> (tensor<4xf32>) { + %extracted = tensor.extract %expanded_6[%arg4, %c0] : tensor<4x2xf32> + %16 = arith.addf %extracted, %cst_0 : f32 + %extracted_8 = tensor.extract %expanded_6[%arg4, %c1] : tensor<4x2xf32> + %17 = arith.addf %extracted_8, %16 : f32 + %extracted_slice_9 = tensor.extract_slice %arg5[%arg4] [1] [1] : tensor<4xf32> to tensor + %inserted = tensor.insert %17 into %extracted_slice_9[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg4] [1] [1] : tensor into tensor<4xf32> + } + } {mapping = [#gpu.thread]} + %expanded_7 = tensor.expand_shape %12 [[0, 1]] : tensor<4xf32> into tensor<2x2xf32> + %13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<2xf32> + %14 = scf.forall (%arg4) in (2) shared_outs(%arg5 = %13) -> (tensor<2xf32>) { + %extracted = tensor.extract %expanded_7[%arg4, %c0] : tensor<2x2xf32> + %16 = arith.addf %extracted, %cst_0 : f32 + %extracted_8 = tensor.extract %expanded_7[%arg4, %c1] : tensor<2x2xf32> + %17 = arith.addf %extracted_8, %16 : f32 + %extracted_slice_9 = tensor.extract_slice %arg5[%arg4] [1] [1] : tensor<2xf32> to tensor + %inserted = tensor.insert %17 into %extracted_slice_9[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg4] [1] [1] : tensor into tensor<2xf32> + } + } {mapping = [#gpu.thread]} + %15 = scf.forall (%arg4) in (1) shared_outs(%arg5 = %arg3) -> (tensor) { + %16 = affine.apply #map4(%arg4) + %extracted = tensor.extract %14[%16] : tensor<2xf32> + %17 = arith.addf %extracted, %cst_0 : f32 + %18 = affine.apply #map5(%arg4) + %extracted_8 = tensor.extract %14[%18] : tensor<2xf32> + %19 = arith.addf %extracted_8, %17 : f32 + %extracted_slice_9 = tensor.extract_slice %arg5[] [] [] : tensor to tensor + %inserted = tensor.insert %19 into %extracted_slice_9[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[] [] [] : tensor into tensor + } + } {mapping = [#gpu.thread]} + scf.forall.in_parallel { + tensor.parallel_insert_slice %15 into %arg3[] [] [] : tensor into tensor + } + } {mapping = [#gpu.block]} + return %3 : tensor + } + func.func private @Unknown148(%arg0: tensor) -> tensor attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 4.000000e+00 : f32 %0 = tensor.empty() : 
tensor - %1 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = []} ins(%arg0 : tensor) outs(%0 : tensor) { + %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%arg0 : tensor) outs(%0 : tensor) { ^bb0(%in: f32, %out: f32): %2 = arith.negf %in : f32 %3 = arith.divf %2, %cst : f32 @@ -1168,445 +2207,619 @@ module @IrToMhlo.2452 { } -> tensor return %1 : tensor } - func.func private @Unknown142(%arg0: tensor<64x3x7x7xf16>) -> tensor<64x3x7x7xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown149(%arg0: tensor<64x3x7x7xf16>) -> tensor<64x3x7x7xf32> attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<64x3x7x7xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x3x7x7xf16>) outs(%0 : tensor<64x3x7x7xf32>) attrs = {xla_shape = "f32[64,3,7,7]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<64x3x7x7xf32> + %1 = scf.for %arg1 = %c0 to %c64 step %c1 iter_args(%arg2 = %0) -> (tensor<64x3x7x7xf32>) { + %2 = scf.for %arg3 = %c0 to %c3 step %c1 iter_args(%arg4 = %arg2) -> (tensor<64x3x7x7xf32>) { + %3 = scf.for %arg5 = %c0 to %c7 step %c1 iter_args(%arg6 = %arg4) -> (tensor<64x3x7x7xf32>) { + %4 = scf.for %arg7 = %c0 to %c7 step %c1 iter_args(%arg8 = %arg6) -> (tensor<64x3x7x7xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<64x3x7x7xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<64x3x7x7xf32> + scf.yield %inserted_slice : tensor<64x3x7x7xf32> + } + scf.yield %4 : tensor<64x3x7x7xf32> + } + scf.yield %3 : tensor<64x3x7x7xf32> + } + scf.yield %2 : tensor<64x3x7x7xf32> + } return %1 : tensor<64x3x7x7xf32> } - func.func private @Unknown143(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<64x64x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf16>) outs(%0 : tensor<64x64x3x3xf32>) attrs = {xla_shape = "f32[64,64,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<64x64x3x3xf32> - return %1 : tensor<64x64x3x3xf32> - } - func.func private @Unknown144(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<64x64x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf16>) outs(%0 : tensor<64x64x3x3xf32>) attrs = {xla_shape = "f32[64,64,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<64x64x3x3xf32> - return %1 : tensor<64x64x3x3xf32> - } - func.func private @Unknown145(%arg0: 
tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<64x64x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf16>) outs(%0 : tensor<64x64x3x3xf32>) attrs = {xla_shape = "f32[64,64,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<64x64x3x3xf32> - return %1 : tensor<64x64x3x3xf32> - } - func.func private @Unknown146(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown150(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<64x64x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf16>) outs(%0 : tensor<64x64x3x3xf32>) attrs = {xla_shape = "f32[64,64,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<64x64x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c64 step %c1 iter_args(%arg2 = %0) -> (tensor<64x64x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %arg2) -> (tensor<64x64x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<64x64x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<64x64x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<64x64x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<64x64x3x3xf32> + scf.yield %inserted_slice : tensor<64x64x3x3xf32> + } + scf.yield %4 : tensor<64x64x3x3xf32> + } + scf.yield %3 : tensor<64x64x3x3xf32> + } + scf.yield %2 : tensor<64x64x3x3xf32> + } return %1 : tensor<64x64x3x3xf32> } - func.func private @Unknown147(%arg0: tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown154(%arg0: tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<128x64x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x64x3x3xf16>) outs(%0 : tensor<128x64x3x3xf32>) attrs = {xla_shape = "f32[128,64,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<128x64x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<128x64x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %arg2) -> (tensor<128x64x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step 
%c1 iter_args(%arg6 = %arg4) -> (tensor<128x64x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<128x64x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<128x64x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<128x64x3x3xf32> + scf.yield %inserted_slice : tensor<128x64x3x3xf32> + } + scf.yield %4 : tensor<128x64x3x3xf32> + } + scf.yield %3 : tensor<128x64x3x3xf32> + } + scf.yield %2 : tensor<128x64x3x3xf32> + } return %1 : tensor<128x64x3x3xf32> } - func.func private @Unknown148(%arg0: tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown155(%arg0: tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<128x128x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x128x3x3xf16>) outs(%0 : tensor<128x128x3x3xf32>) attrs = {xla_shape = "f32[128,128,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<128x128x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<128x128x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %arg2) -> (tensor<128x128x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<128x128x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<128x128x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<128x128x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<128x128x3x3xf32> + scf.yield %inserted_slice : tensor<128x128x3x3xf32> + } + scf.yield %4 : tensor<128x128x3x3xf32> + } + scf.yield %3 : tensor<128x128x3x3xf32> + } + scf.yield %2 : tensor<128x128x3x3xf32> + } return %1 : tensor<128x128x3x3xf32> } - func.func private @Unknown149(%arg0: tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown156(%arg0: tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<128x64x1x1xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x64x1x1xf16>) outs(%0 : tensor<128x64x1x1xf32>) 
attrs = {xla_shape = "f32[128,64,1,1]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<128x64x1x1xf32> + %1 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<128x64x1x1xf32>) { + %2 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %arg2) -> (tensor<128x64x1x1xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<128x64x1x1xf16> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f16, %out: f32): + %5 = arith.extf %in : f16 to f32 + linalg.yield %5 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<128x64x1x1xf32> + scf.yield %inserted_slice : tensor<128x64x1x1xf32> + } + scf.yield %2 : tensor<128x64x1x1xf32> + } return %1 : tensor<128x64x1x1xf32> } - func.func private @Unknown150(%arg0: tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<128x128x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x128x3x3xf16>) outs(%0 : tensor<128x128x3x3xf32>) attrs = {xla_shape = "f32[128,128,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<128x128x3x3xf32> - return %1 : tensor<128x128x3x3xf32> - } - func.func private @Unknown151(%arg0: tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<128x128x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x128x3x3xf16>) outs(%0 : tensor<128x128x3x3xf32>) attrs = {xla_shape = "f32[128,128,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<128x128x3x3xf32> - return %1 : tensor<128x128x3x3xf32> - } - func.func private @Unknown152(%arg0: tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown159(%arg0: tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<256x128x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x128x3x3xf16>) outs(%0 : tensor<256x128x3x3xf32>) attrs = {xla_shape = "f32[256,128,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<256x128x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 iter_args(%arg2 = %0) -> (tensor<256x128x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %arg2) -> (tensor<256x128x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<256x128x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<256x128x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 
1] : tensor<256x128x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<256x128x3x3xf32> + scf.yield %inserted_slice : tensor<256x128x3x3xf32> + } + scf.yield %4 : tensor<256x128x3x3xf32> + } + scf.yield %3 : tensor<256x128x3x3xf32> + } + scf.yield %2 : tensor<256x128x3x3xf32> + } return %1 : tensor<256x128x3x3xf32> } - func.func private @Unknown153(%arg0: tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown160(%arg0: tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<256x256x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x256x3x3xf16>) outs(%0 : tensor<256x256x3x3xf32>) attrs = {xla_shape = "f32[256,256,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<256x256x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 iter_args(%arg2 = %0) -> (tensor<256x256x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %arg2) -> (tensor<256x256x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<256x256x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<256x256x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<256x256x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<256x256x3x3xf32> + scf.yield %inserted_slice : tensor<256x256x3x3xf32> + } + scf.yield %4 : tensor<256x256x3x3xf32> + } + scf.yield %3 : tensor<256x256x3x3xf32> + } + scf.yield %2 : tensor<256x256x3x3xf32> + } return %1 : tensor<256x256x3x3xf32> } - func.func private @Unknown154(%arg0: tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown161(%arg0: tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<256x128x1x1xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x128x1x1xf16>) outs(%0 : tensor<256x128x1x1xf32>) attrs = {xla_shape = "f32[256,128,1,1]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<256x128x1x1xf32> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 iter_args(%arg2 = %0) -> 
(tensor<256x128x1x1xf32>) { + %2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %arg2) -> (tensor<256x128x1x1xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<256x128x1x1xf16> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f16, %out: f32): + %5 = arith.extf %in : f16 to f32 + linalg.yield %5 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<256x128x1x1xf32> + scf.yield %inserted_slice : tensor<256x128x1x1xf32> + } + scf.yield %2 : tensor<256x128x1x1xf32> + } return %1 : tensor<256x128x1x1xf32> } - func.func private @Unknown155(%arg0: tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<256x256x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x256x3x3xf16>) outs(%0 : tensor<256x256x3x3xf32>) attrs = {xla_shape = "f32[256,256,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<256x256x3x3xf32> - return %1 : tensor<256x256x3x3xf32> - } - func.func private @Unknown156(%arg0: tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<256x256x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x256x3x3xf16>) outs(%0 : tensor<256x256x3x3xf32>) attrs = {xla_shape = "f32[256,256,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<256x256x3x3xf32> - return %1 : tensor<256x256x3x3xf32> - } - func.func private @Unknown157(%arg0: tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown164(%arg0: tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<512x256x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x256x3x3xf16>) outs(%0 : tensor<512x256x3x3xf32>) attrs = {xla_shape = "f32[512,256,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<512x256x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<512x256x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %arg2) -> (tensor<512x256x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<512x256x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<512x256x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<512x256x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = 
arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<512x256x3x3xf32> + scf.yield %inserted_slice : tensor<512x256x3x3xf32> + } + scf.yield %4 : tensor<512x256x3x3xf32> + } + scf.yield %3 : tensor<512x256x3x3xf32> + } + scf.yield %2 : tensor<512x256x3x3xf32> + } return %1 : tensor<512x256x3x3xf32> } - func.func private @Unknown158(%arg0: tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown165(%arg0: tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<512x512x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x512x3x3xf16>) outs(%0 : tensor<512x512x3x3xf32>) attrs = {xla_shape = "f32[512,512,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<512x512x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<512x512x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %arg2) -> (tensor<512x512x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<512x512x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<512x512x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<512x512x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<512x512x3x3xf32> + scf.yield %inserted_slice : tensor<512x512x3x3xf32> + } + scf.yield %4 : tensor<512x512x3x3xf32> + } + scf.yield %3 : tensor<512x512x3x3xf32> + } + scf.yield %2 : tensor<512x512x3x3xf32> + } return %1 : tensor<512x512x3x3xf32> } - func.func private @Unknown159(%arg0: tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown166(%arg0: tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<512x256x1x1xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x256x1x1xf16>) outs(%0 : tensor<512x256x1x1xf32>) attrs = {xla_shape = "f32[512,256,1,1]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<512x256x1x1xf32> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<512x256x1x1xf32>) { + %2 = scf.for %arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %arg2) -> (tensor<512x256x1x1xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : 
tensor<512x256x1x1xf16> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f16, %out: f32): + %5 = arith.extf %in : f16 to f32 + linalg.yield %5 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<512x256x1x1xf32> + scf.yield %inserted_slice : tensor<512x256x1x1xf32> + } + scf.yield %2 : tensor<512x256x1x1xf32> + } return %1 : tensor<512x256x1x1xf32> } - func.func private @Unknown160(%arg0: tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<512x512x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x512x3x3xf16>) outs(%0 : tensor<512x512x3x3xf32>) attrs = {xla_shape = "f32[512,512,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<512x512x3x3xf32> - return %1 : tensor<512x512x3x3xf32> - } - func.func private @Unknown161(%arg0: tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<512x512x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x512x3x3xf16>) outs(%0 : tensor<512x512x3x3xf32>) attrs = {xla_shape = "f32[512,512,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<512x512x3x3xf32> - return %1 : tensor<512x512x3x3xf32> - } - func.func private @MatmulOp162(%arg0: tensor<4x512xf16>, %arg1: tensor<4x1000xf16>) -> tensor<1000x512xf16> attributes {__byre__lhs_contracting_dimension = 0 : i64, __byre__output_transpose, __byre__rhs_contracting_dimension = 0 : i64, byre_compute_name = "MatmulOp"} { + func.func private @MatmulOp169(%arg0: tensor<4x512xf16>, %arg1: tensor<4x1000xf16>) -> tensor<1000x512xf16> attributes {__byre__lhs_contracting_dimension = 0 : i64, __byre__output_transpose, __byre__rhs_contracting_dimension = 0 : i64, byre_compute_name = "MatmulOp"} { %0 = "mhlo.dot_general"(%arg0, %arg1) {dot_dimension_numbers = #mhlo.dot, precision_config = [#mhlo, #mhlo]} : (tensor<4x512xf16>, tensor<4x1000xf16>) -> tensor<512x1000xf16> %1 = "mhlo.transpose"(%0) {permutation = dense<[1, 0]> : tensor<2xi64>, xla_shape = "f16[1000,512]{0,1}"} : (tensor<512x1000xf16>) -> tensor<1000x512xf16> return %1 : tensor<1000x512xf16> } - func.func private @Unknown163(%arg0: tensor<1000x512xf16>) -> tensor<1000x512xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown170(%arg0: tensor<1000x512xf16>) -> tensor<1000x512xf32> attributes {__byteir_elementwise_fusion__} { + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<1000x512xf32> - %1 = linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1000x512xf16>) outs(%0 : tensor<1000x512xf32>) attrs = {xla_shape = "f32[1000,512]{0,1}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<1000x512xf32> + %1 = scf.for %arg1 = %c0 to %c1000 step %c1 iter_args(%arg2 = %0) -> (tensor<1000x512xf32>) { + %2 = scf.for %arg3 = %c0 to 
%c512 step %c1 iter_args(%arg4 = %arg2) -> (tensor<1000x512xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3] [1, 1] [1, 1] : tensor<1000x512xf16> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f16, %out: f32): + %5 = arith.extf %in : f16 to f32 + linalg.yield %5 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3] [1, 1] [1, 1] : tensor into tensor<1000x512xf32> + scf.yield %inserted_slice : tensor<1000x512xf32> + } + scf.yield %2 : tensor<1000x512xf32> + } return %1 : tensor<1000x512xf32> } - func.func private @Unknown164(%arg0: tensor<1000xf32>) -> tensor<1000xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown171(%arg0: tensor<4x1000xf16>) -> tensor<1000xf32> attributes {__byteir_reduction_fusion__} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 0.000000e+00 : f16 + %cst_0 = arith.constant 0.000000e+00 : f32 %0 = tensor.empty() : tensor<1000xf32> - %1 = linalg.generic {indexing_maps = [#map6, #map6], iterator_types = ["parallel"]} ins(%arg0 : tensor<1000xf32>) outs(%0 : tensor<1000xf32>) { - ^bb0(%in: f32, %out: f32): - %2 = arith.truncf %in : f32 to f16 - %3 = arith.extf %2 : f16 to f32 - linalg.yield %3 : f32 - } -> tensor<1000xf32> + %1 = scf.forall (%arg1) in (32) shared_outs(%arg2 = %0) -> (tensor<1000xf32>) { + %2 = affine.min #map11(%arg1) + %3 = affine.apply #map10(%arg1) + %4 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<32xf32> + %5 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<2x32xf32> + %6 = scf.forall (%arg3, %arg4) in (2, 32) shared_outs(%arg5 = %5) -> (tensor<2x32xf32>) { + %8 = affine.min #map12(%arg4, %arg1) + %9 = affine.min #map13(%arg4, %arg1) + %10 = affine.apply #map3(%9, %8) + %11 = arith.cmpi ugt, %10, %c0 : index + %12 = scf.if %11 -> (f16) { + %19 = affine.apply #map4(%arg3) + %20 = affine.apply #map14(%arg1)[%8] + %extracted = tensor.extract %arg0[%19, %20] : tensor<4x1000xf16> + scf.yield %extracted : f16 + } else { + scf.yield %cst : f16 + } + %13 = arith.extf %12 : f16 to f32 + %14 = arith.addf %13, %cst_0 : f32 + %15 = arith.cmpi ugt, %10, %c0 : index + %16 = scf.if %15 -> (f16) { + %19 = affine.apply #map5(%arg3) + %20 = affine.apply #map14(%arg1)[%8] + %extracted = tensor.extract %arg0[%19, %20] : tensor<4x1000xf16> + scf.yield %extracted : f16 + } else { + scf.yield %cst : f16 + } + %17 = arith.extf %16 : f16 to f32 + %18 = arith.addf %14, %17 : f32 + %extracted_slice_1 = tensor.extract_slice %arg5[%arg3, %arg4] [1, 1] [1, 1] : tensor<2x32xf32> to tensor + %inserted = tensor.insert %18 into %extracted_slice_1[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg3, %arg4] [1, 1] [1, 1] : tensor into tensor<2x32xf32> + } + } {mapping = [#gpu.thread, #gpu.thread]} + %7 = scf.forall (%arg3) in (32) shared_outs(%arg4 = %4) -> (tensor<32xf32>) { + %extracted = tensor.extract %6[%c0, %arg3] : tensor<2x32xf32> + %8 = arith.addf %extracted, %cst_0 : f32 + %extracted_1 = tensor.extract %6[%c1, %arg3] : tensor<2x32xf32> + %9 = arith.addf %extracted_1, %8 : f32 + %extracted_slice_2 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<32xf32> to tensor + %inserted = tensor.insert %9 into %extracted_slice_2[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : 
tensor into tensor<32xf32> + } + } {mapping = [#gpu.thread]} + %extracted_slice = tensor.extract_slice %7[0] [%2] [1] : tensor<32xf32> to tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %extracted_slice into %arg2[%3] [%2] [1] : tensor into tensor<1000xf32> + } + } {mapping = [#gpu.block]} return %1 : tensor<1000xf32> } - func.func @main(%arg0: tensor<4x3x224x224xf32>, %arg1: tensor<4x1000xf32>, %arg2: tensor<64x3x7x7xf32>, %arg3: tensor<64xf32>, %arg4: tensor<64xf32>, %arg5: tensor<64xf32>, %arg6: tensor<64xf32>, %arg7: tensor<64x64x3x3xf32>, %arg8: tensor<64xf32>, %arg9: tensor<64xf32>, %arg10: tensor<64xf32>, %arg11: tensor<64xf32>, %arg12: tensor<64x64x3x3xf32>, %arg13: tensor<64xf32>, %arg14: tensor<64xf32>, %arg15: tensor<64xf32>, %arg16: tensor<64xf32>, %arg17: tensor<64x64x3x3xf32>, %arg18: tensor<64xf32>, %arg19: tensor<64xf32>, %arg20: tensor<64xf32>, %arg21: tensor<64xf32>, %arg22: tensor<64x64x3x3xf32>, %arg23: tensor<64xf32>, %arg24: tensor<64xf32>, %arg25: tensor<64xf32>, %arg26: tensor<64xf32>, %arg27: tensor<128x64x3x3xf32>, %arg28: tensor<128xf32>, %arg29: tensor<128xf32>, %arg30: tensor<128xf32>, %arg31: tensor<128xf32>, %arg32: tensor<128x128x3x3xf32>, %arg33: tensor<128xf32>, %arg34: tensor<128xf32>, %arg35: tensor<128xf32>, %arg36: tensor<128xf32>, %arg37: tensor<128x64x1x1xf32>, %arg38: tensor<128xf32>, %arg39: tensor<128xf32>, %arg40: tensor<128xf32>, %arg41: tensor<128xf32>, %arg42: tensor<128x128x3x3xf32>, %arg43: tensor<128xf32>, %arg44: tensor<128xf32>, %arg45: tensor<128xf32>, %arg46: tensor<128xf32>, %arg47: tensor<128x128x3x3xf32>, %arg48: tensor<128xf32>, %arg49: tensor<128xf32>, %arg50: tensor<128xf32>, %arg51: tensor<128xf32>, %arg52: tensor<256x128x3x3xf32>, %arg53: tensor<256xf32>, %arg54: tensor<256xf32>, %arg55: tensor<256xf32>, %arg56: tensor<256xf32>, %arg57: tensor<256x256x3x3xf32>, %arg58: tensor<256xf32>, %arg59: tensor<256xf32>, %arg60: tensor<256xf32>, %arg61: tensor<256xf32>, %arg62: tensor<256x128x1x1xf32>, %arg63: tensor<256xf32>, %arg64: tensor<256xf32>, %arg65: tensor<256xf32>, %arg66: tensor<256xf32>, %arg67: tensor<256x256x3x3xf32>, %arg68: tensor<256xf32>, %arg69: tensor<256xf32>, %arg70: tensor<256xf32>, %arg71: tensor<256xf32>, %arg72: tensor<256x256x3x3xf32>, %arg73: tensor<256xf32>, %arg74: tensor<256xf32>, %arg75: tensor<256xf32>, %arg76: tensor<256xf32>, %arg77: tensor<512x256x3x3xf32>, %arg78: tensor<512xf32>, %arg79: tensor<512xf32>, %arg80: tensor<512xf32>, %arg81: tensor<512xf32>, %arg82: tensor<512x512x3x3xf32>, %arg83: tensor<512xf32>, %arg84: tensor<512xf32>, %arg85: tensor<512xf32>, %arg86: tensor<512xf32>, %arg87: tensor<512x256x1x1xf32>, %arg88: tensor<512xf32>, %arg89: tensor<512xf32>, %arg90: tensor<512xf32>, %arg91: tensor<512xf32>, %arg92: tensor<512x512x3x3xf32>, %arg93: tensor<512xf32>, %arg94: tensor<512xf32>, %arg95: tensor<512xf32>, %arg96: tensor<512xf32>, %arg97: tensor<512x512x3x3xf32>, %arg98: tensor<512xf32>, %arg99: tensor<512xf32>, %arg100: tensor<512xf32>, %arg101: tensor<512xf32>, %arg102: tensor<1000x512xf32>, %arg103: tensor<1000xf32>) -> (tensor, tensor<64x3x7x7xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128x64x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x64x1x1xf32>, tensor<128xf32>, 
tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256x128x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x128x1x1xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512x256x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x256x1x1xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1000x512xf32>, tensor<1000xf32>) { - %0 = mhlo.constant dense<0.000000e+00> : tensor - %1 = mhlo.constant dense<0.000000e+00> : tensor - %2 = mhlo.constant dense<0xFC00> : tensor - %3 = call @Unknown0(%arg0) : (tensor<4x3x224x224xf32>) -> tensor<4x3x224x224xf16> - %4 = call @Unknown1(%arg2) : (tensor<64x3x7x7xf32>) -> tensor<64x3x7x7xf16> - %5 = mhlo.convolution(%3, %4) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[3, 3], [3, 3]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x3x224x224xf16>, tensor<64x3x7x7xf16>) -> tensor<4x64x112x112xf16> - %6 = call @BatchNormTrainingOp2(%5, %arg3, %arg4) : (tensor<4x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) -> tensor<4x64x112x112xf16> - %7 = call @Unknown3(%arg7) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> - %8 = call @Unknown4(%arg12) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> - %9 = call @Unknown5(%arg17) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> - %10 = call @Unknown6(%arg22) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> - %11 = call @Unknown7(%arg37) : (tensor<128x64x1x1xf32>) -> tensor<128x64x1x1xf16> - %12 = call @Unknown8(%arg27) : (tensor<128x64x3x3xf32>) -> tensor<128x64x3x3xf16> - %13 = call @Unknown9(%arg32) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> - %14 = call @Unknown10(%arg42) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> - %15 = call @Unknown11(%arg47) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> - %16 = call @Unknown12(%arg62) : (tensor<256x128x1x1xf32>) -> tensor<256x128x1x1xf16> - %17 = call @Unknown13(%arg52) : (tensor<256x128x3x3xf32>) -> tensor<256x128x3x3xf16> - %18 = call @Unknown14(%arg57) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> - %19 = call @Unknown15(%arg67) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> - %20 = call @Unknown16(%arg72) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> - %21 = call @Unknown17(%arg87) : (tensor<512x256x1x1xf32>) -> tensor<512x256x1x1xf16> - %22 = call @Unknown18(%arg77) : (tensor<512x256x3x3xf32>) -> tensor<512x256x3x3xf16> - %23 = call @Unknown19(%arg82) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> - %24 = call @Unknown20(%arg92) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> - %25 = call @Unknown21(%arg97) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> - %26 = call @Unknown22(%arg1) : (tensor<4x1000xf32>) -> tensor<4x1000xf16> - %27 = call @Unknown23(%arg102) : (tensor<1000x512xf32>) -> tensor<1000x512xf16> - %28 = mhlo.reduce(%26 init: %1) across dimensions = [1] : (tensor<4x1000xf16>, tensor) -> tensor<4xf16> - reducer(%arg104: tensor, %arg105: tensor) { - %198 = mhlo.add %arg104, %arg105 : tensor - 
mhlo.return %198 : tensor + func.func private @Unknown172(%arg0: tensor<1000xf32>) -> tensor<1000xf32> attributes {__byteir_elementwise_fusion__} { + %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index + %c0 = arith.constant 0 : index + %0 = tensor.empty() : tensor<1000xf32> + %1 = scf.for %arg1 = %c0 to %c1000 step %c1 iter_args(%arg2 = %0) -> (tensor<1000xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1] [1] [1] : tensor<1000xf32> to tensor + %2 = tensor.empty() : tensor + %3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%2 : tensor) { + ^bb0(%in: f32, %out: f32): + %4 = arith.truncf %in : f32 to f16 + %5 = arith.extf %4 : f16 to f32 + linalg.yield %5 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %3 into %arg2[%arg1] [1] [1] : tensor into tensor<1000xf32> + scf.yield %inserted_slice : tensor<1000xf32> } - %29:2 = call @Unknown24(%6) : (tensor<4x64x112x112xf16>) -> (tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1>) - %30 = "mhlo.reduce_window"(%29#0, %2) ({ + return %1 : tensor<1000xf32> + } + func.func @main(%arg0: tensor<4x3x224x224xf32>, %arg1: tensor<4x1000xf32>, %arg2: tensor<64x3x7x7xf32>, %arg3: tensor<64xf32>, %arg4: tensor<64xf32>, %arg5: tensor<64xf32>, %arg6: tensor<64xf32>, %arg7: tensor<64x64x3x3xf32>, %arg8: tensor<64xf32>, %arg9: tensor<64xf32>, %arg10: tensor<64xf32>, %arg11: tensor<64xf32>, %arg12: tensor<64x64x3x3xf32>, %arg13: tensor<64xf32>, %arg14: tensor<64xf32>, %arg15: tensor<64xf32>, %arg16: tensor<64xf32>, %arg17: tensor<64x64x3x3xf32>, %arg18: tensor<64xf32>, %arg19: tensor<64xf32>, %arg20: tensor<64xf32>, %arg21: tensor<64xf32>, %arg22: tensor<64x64x3x3xf32>, %arg23: tensor<64xf32>, %arg24: tensor<64xf32>, %arg25: tensor<64xf32>, %arg26: tensor<64xf32>, %arg27: tensor<128x64x3x3xf32>, %arg28: tensor<128xf32>, %arg29: tensor<128xf32>, %arg30: tensor<128xf32>, %arg31: tensor<128xf32>, %arg32: tensor<128x128x3x3xf32>, %arg33: tensor<128xf32>, %arg34: tensor<128xf32>, %arg35: tensor<128xf32>, %arg36: tensor<128xf32>, %arg37: tensor<128x64x1x1xf32>, %arg38: tensor<128xf32>, %arg39: tensor<128xf32>, %arg40: tensor<128xf32>, %arg41: tensor<128xf32>, %arg42: tensor<128x128x3x3xf32>, %arg43: tensor<128xf32>, %arg44: tensor<128xf32>, %arg45: tensor<128xf32>, %arg46: tensor<128xf32>, %arg47: tensor<128x128x3x3xf32>, %arg48: tensor<128xf32>, %arg49: tensor<128xf32>, %arg50: tensor<128xf32>, %arg51: tensor<128xf32>, %arg52: tensor<256x128x3x3xf32>, %arg53: tensor<256xf32>, %arg54: tensor<256xf32>, %arg55: tensor<256xf32>, %arg56: tensor<256xf32>, %arg57: tensor<256x256x3x3xf32>, %arg58: tensor<256xf32>, %arg59: tensor<256xf32>, %arg60: tensor<256xf32>, %arg61: tensor<256xf32>, %arg62: tensor<256x128x1x1xf32>, %arg63: tensor<256xf32>, %arg64: tensor<256xf32>, %arg65: tensor<256xf32>, %arg66: tensor<256xf32>, %arg67: tensor<256x256x3x3xf32>, %arg68: tensor<256xf32>, %arg69: tensor<256xf32>, %arg70: tensor<256xf32>, %arg71: tensor<256xf32>, %arg72: tensor<256x256x3x3xf32>, %arg73: tensor<256xf32>, %arg74: tensor<256xf32>, %arg75: tensor<256xf32>, %arg76: tensor<256xf32>, %arg77: tensor<512x256x3x3xf32>, %arg78: tensor<512xf32>, %arg79: tensor<512xf32>, %arg80: tensor<512xf32>, %arg81: tensor<512xf32>, %arg82: tensor<512x512x3x3xf32>, %arg83: tensor<512xf32>, %arg84: tensor<512xf32>, %arg85: tensor<512xf32>, %arg86: tensor<512xf32>, %arg87: tensor<512x256x1x1xf32>, %arg88: tensor<512xf32>, %arg89: tensor<512xf32>, %arg90: tensor<512xf32>, %arg91: tensor<512xf32>, %arg92: 
tensor<512x512x3x3xf32>, %arg93: tensor<512xf32>, %arg94: tensor<512xf32>, %arg95: tensor<512xf32>, %arg96: tensor<512xf32>, %arg97: tensor<512x512x3x3xf32>, %arg98: tensor<512xf32>, %arg99: tensor<512xf32>, %arg100: tensor<512xf32>, %arg101: tensor<512xf32>, %arg102: tensor<1000x512xf32>, %arg103: tensor<1000xf32>) -> (tensor, tensor<64x3x7x7xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128x64x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x64x1x1xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256x128x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x128x1x1xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512x256x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x256x1x1xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1000x512xf32>, tensor<1000xf32>) { + %0 = mhlo.constant dense<0.000000e+00> : tensor + %1 = mhlo.constant dense<0xFC00> : tensor + %2 = call @Unknown0(%arg0) : (tensor<4x3x224x224xf32>) -> tensor<4x3x224x224xf16> + %3 = call @Unknown1(%arg2) : (tensor<64x3x7x7xf32>) -> tensor<64x3x7x7xf16> + %4 = mhlo.convolution(%2, %3) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[3, 3], [3, 3]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x3x224x224xf16>, tensor<64x3x7x7xf16>) -> tensor<4x64x112x112xf16> + %5 = call @BatchNormTrainingOp2(%4, %arg3, %arg4) : (tensor<4x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) -> tensor<4x64x112x112xf16> + %6 = call @Unknown3(%arg7) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> + %7 = call @Unknown3(%arg12) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> + %8 = call @Unknown3(%arg17) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> + %9 = call @Unknown3(%arg22) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> + %10 = call @Unknown7(%arg37) : (tensor<128x64x1x1xf32>) -> tensor<128x64x1x1xf16> + %11 = call @Unknown8(%arg27) : (tensor<128x64x3x3xf32>) -> tensor<128x64x3x3xf16> + %12 = call @Unknown9(%arg32) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> + %13 = call @Unknown9(%arg42) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> + %14 = call @Unknown9(%arg47) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> + %15 = call @Unknown12(%arg62) : (tensor<256x128x1x1xf32>) -> tensor<256x128x1x1xf16> + %16 = call @Unknown13(%arg52) : (tensor<256x128x3x3xf32>) -> tensor<256x128x3x3xf16> + %17 = call @Unknown14(%arg57) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> + %18 = call @Unknown14(%arg67) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> + %19 = call @Unknown14(%arg72) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> + %20 = call @Unknown17(%arg87) : (tensor<512x256x1x1xf32>) -> tensor<512x256x1x1xf16> + 
%21 = call @Unknown18(%arg77) : (tensor<512x256x3x3xf32>) -> tensor<512x256x3x3xf16> + %22 = call @Unknown19(%arg82) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> + %23 = call @Unknown19(%arg92) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> + %24 = call @Unknown19(%arg97) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> + %25 = call @Unknown22(%arg1) : (tensor<4x1000xf32>) -> tensor<4x1000xf16> + %26 = call @Unknown23(%arg102) : (tensor<1000x512xf32>) -> tensor<1000x512xf16> + %27 = call @Unknown24(%arg103) : (tensor<1000xf32>) -> tensor<1000xf16> + %28 = call @Unknown25(%25) : (tensor<4x1000xf16>) -> tensor<4xf16> + %29:2 = call @Unknown26(%5) : (tensor<4x64x112x112xf16>) -> (tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1>) + %30 = "mhlo.reduce_window"(%29#0, %1) ({ ^bb0(%arg104: tensor, %arg105: tensor): - %198 = mhlo.maximum %arg104, %arg105 : tensor - mhlo.return %198 : tensor + %199 = mhlo.maximum %arg104, %arg105 : tensor + mhlo.return %199 : tensor }) {base_dilations = dense<1> : tensor<4xi64>, padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = dense<1> : tensor<4xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : (tensor<4x64x112x112xf16>, tensor) -> tensor<4x64x56x56xf16> - %31 = mhlo.convolution(%30, %7) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> - %32 = call @BatchNormTrainingOp25(%31, %arg8, %arg9) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> tensor<4x64x56x56xf16> - %33:2 = call @Unknown26(%32) : (tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) - %34 = mhlo.convolution(%33#0, %8) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> + %31 = mhlo.convolution(%30, %6) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> + %32 = call @BatchNormTrainingOp27(%31, %arg8, %arg9) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> tensor<4x64x56x56xf16> + %33:2 = call @Unknown28(%32) : (tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) + %34 = mhlo.convolution(%33#0, %7) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> %35 = call @BatchNormTrainingOp27(%34, %arg13, %arg14) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> tensor<4x64x56x56xf16> - %36:2 = call @Unknown28(%35, %30) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) - %37 = 
mhlo.convolution(%36#0, %9) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> - %38 = call @BatchNormTrainingOp29(%37, %arg18, %arg19) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> tensor<4x64x56x56xf16> - %39:2 = call @Unknown30(%38) : (tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) - %40 = mhlo.convolution(%39#0, %10) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> - %41 = call @BatchNormTrainingOp31(%40, %arg23, %arg24) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> tensor<4x64x56x56xf16> - %42:2 = call @Unknown32(%41, %36#0) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) - %43 = mhlo.convolution(%42#0, %11) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<128x64x1x1xf16>) -> tensor<4x128x28x28xf16> - %44 = call @BatchNormTrainingOp33(%43, %arg38, %arg39) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> tensor<4x128x28x28xf16> - %45 = mhlo.convolution(%42#0, %12) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<128x64x3x3xf16>) -> tensor<4x128x28x28xf16> - %46 = call @BatchNormTrainingOp34(%45, %arg28, %arg29) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> tensor<4x128x28x28xf16> - %47:2 = call @Unknown35(%46) : (tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) - %48 = mhlo.convolution(%47#0, %13) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> - %49 = call @BatchNormTrainingOp36(%48, %arg33, %arg34) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> tensor<4x128x28x28xf16> - %50:2 = call @Unknown37(%49, %44) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) - %51 = mhlo.convolution(%50#0, %14) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> - %52 = call @BatchNormTrainingOp38(%51, %arg43, %arg44) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> tensor<4x128x28x28xf16> - %53:2 = call @Unknown39(%52) : 
(tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) - %54 = mhlo.convolution(%53#0, %15) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> - %55 = call @BatchNormTrainingOp40(%54, %arg48, %arg49) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> tensor<4x128x28x28xf16> - %56:2 = call @Unknown41(%55, %50#0) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) - %57 = mhlo.convolution(%56#0, %16) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<256x128x1x1xf16>) -> tensor<4x256x14x14xf16> - %58 = call @BatchNormTrainingOp42(%57, %arg63, %arg64) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> tensor<4x256x14x14xf16> - %59 = mhlo.convolution(%56#0, %17) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<256x128x3x3xf16>) -> tensor<4x256x14x14xf16> - %60 = call @BatchNormTrainingOp43(%59, %arg53, %arg54) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> tensor<4x256x14x14xf16> - %61:2 = call @Unknown44(%60) : (tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) - %62 = mhlo.convolution(%61#0, %18) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> - %63 = call @BatchNormTrainingOp45(%62, %arg58, %arg59) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> tensor<4x256x14x14xf16> - %64:2 = call @Unknown46(%63, %58) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) - %65 = mhlo.convolution(%64#0, %19) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> - %66 = call @BatchNormTrainingOp47(%65, %arg68, %arg69) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> tensor<4x256x14x14xf16> - %67:2 = call @Unknown48(%66) : (tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) - %68 = mhlo.convolution(%67#0, %20) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> - %69 = call @BatchNormTrainingOp49(%68, %arg73, %arg74) : 
(tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> tensor<4x256x14x14xf16> - %70:2 = call @Unknown50(%69, %64#0) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) - %71 = mhlo.convolution(%70#0, %21) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<512x256x1x1xf16>) -> tensor<4x512x7x7xf16> - %72 = call @BatchNormTrainingOp51(%71, %arg88, %arg89) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> tensor<4x512x7x7xf16> - %73 = mhlo.convolution(%70#0, %22) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<512x256x3x3xf16>) -> tensor<4x512x7x7xf16> - %74 = call @BatchNormTrainingOp52(%73, %arg78, %arg79) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> tensor<4x512x7x7xf16> - %75:2 = call @Unknown53(%74) : (tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) - %76 = mhlo.convolution(%75#0, %23) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> - %77 = call @BatchNormTrainingOp54(%76, %arg83, %arg84) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> tensor<4x512x7x7xf16> - %78:2 = call @Unknown55(%77, %72) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) - %79 = mhlo.convolution(%78#0, %24) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> - %80 = call @BatchNormTrainingOp56(%79, %arg93, %arg94) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> tensor<4x512x7x7xf16> - %81:2 = call @Unknown57(%80) : (tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) - %82 = mhlo.convolution(%81#0, %25) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> - %83 = call @BatchNormTrainingOp58(%82, %arg98, %arg99) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> tensor<4x512x7x7xf16> - %84:2 = call @Unknown59(%83, %78#0) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) - %85 = mhlo.reduce(%84#0 init: %1) across dimensions = [3, 2] : (tensor<4x512x7x7xf16>, tensor) -> tensor<4x512xf16> - reducer(%arg104: tensor, %arg105: tensor) { - %198 = mhlo.add %arg104, %arg105 : tensor - mhlo.return %198 : tensor - } - %86 = call @Unknown60(%85) : (tensor<4x512xf16>) -> tensor<4x512xf16> - %87 = 
"mhlo.dot_general"(%86, %27) {dot_dimension_numbers = #mhlo.dot, precision_config = [#mhlo, #mhlo]} : (tensor<4x512xf16>, tensor<1000x512xf16>) -> tensor<4x1000xf16> - %88 = call @Unknown61(%arg103, %87) : (tensor<1000xf32>, tensor<4x1000xf16>) -> tensor<4x1000xf16> - %89 = mhlo.reduce(%88 init: %2) across dimensions = [1] : (tensor<4x1000xf16>, tensor) -> tensor<4xf16> - reducer(%arg104: tensor, %arg105: tensor) { - %198 = mhlo.maximum %arg104, %arg105 : tensor - mhlo.return %198 : tensor - } - %90:2 = call @Unknown62(%89, %88) : (tensor<4xf16>, tensor<4x1000xf16>) -> (tensor<4x1000xf16>, tensor<4x1000xf16>) - %91 = mhlo.reduce(%90#1 init: %1) across dimensions = [1] : (tensor<4x1000xf16>, tensor) -> tensor<4xf16> - reducer(%arg104: tensor, %arg105: tensor) { - %198 = mhlo.add %arg104, %arg105 : tensor - mhlo.return %198 : tensor - } - %92:3 = call @Unknown63(%91, %90#0, %28, %26, %arg1) : (tensor<4xf16>, tensor<4x1000xf16>, tensor<4xf16>, tensor<4x1000xf16>, tensor<4x1000xf32>) -> (tensor<4x1000xf16>, tensor<4x1000xf32>, tensor<4x1000xf32>) - %93 = "mhlo.dot"(%92#0, %27) {precision_config = [#mhlo, #mhlo]} : (tensor<4x1000xf16>, tensor<1000x512xf16>) -> tensor<4x512xf16> - %94 = call @Unknown64(%93, %84#1) : (tensor<4x512xf16>, tensor<4x512x7x7xi1>) -> tensor<4x512x7x7xf16> - %95:3 = call @BatchNormGradOp65(%82, %arg98, %94) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %96 = call @ConvBackwardDataOp66(%95#0, %25) : (tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> - %97 = call @ConvBackwardFilterOp67(%81#0, %95#0) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> tensor<512x512x3x3xf16> - %98 = call @Unknown68(%81#1, %96) : (tensor<4x512x7x7xi1>, tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf16> - %99:3 = call @BatchNormGradOp69(%79, %arg93, %98) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %100 = call @ConvBackwardDataOp70(%99#0, %24) : (tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> - %101 = call @ConvBackwardFilterOp71(%78#0, %99#0) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> tensor<512x512x3x3xf16> - %102 = call @Unknown72(%94, %100, %78#1) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) -> tensor<4x512x7x7xf16> - %103:3 = call @BatchNormGradOp73(%76, %arg83, %102) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %104 = call @ConvBackwardDataOp74(%103#0, %23) : (tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> - %105 = call @ConvBackwardFilterOp75(%75#0, %103#0) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> tensor<512x512x3x3xf16> - %106 = call @Unknown76(%75#1, %104) : (tensor<4x512x7x7xi1>, tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf16> - %107:3 = call @BatchNormGradOp77(%73, %arg78, %106) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) - %108 = call @ConvBackwardDataOp78(%107#0, %22) : (tensor<4x512x7x7xf16>, tensor<512x256x3x3xf16>) -> tensor<4x256x14x14xf16> - %109 = call @ConvBackwardFilterOp79(%70#0, %107#0) : (tensor<4x256x14x14xf16>, tensor<4x512x7x7xf16>) -> tensor<512x256x3x3xf16> - %110:3 = call @BatchNormGradOp80(%71, %arg88, %102) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, 
tensor<512xf32>) - %111 = call @ConvBackwardDataOp81(%110#0, %21) : (tensor<4x512x7x7xf16>, tensor<512x256x1x1xf16>) -> tensor<4x256x14x14xf16> - %112 = call @ConvBackwardFilterOp82(%70#0, %110#0) : (tensor<4x256x14x14xf16>, tensor<4x512x7x7xf16>) -> tensor<512x256x1x1xf16> - %113 = call @Unknown83(%111, %108, %70#1) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) -> tensor<4x256x14x14xf16> - %114:3 = call @BatchNormGradOp84(%68, %arg73, %113) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %115 = call @ConvBackwardDataOp85(%114#0, %20) : (tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> - %116 = call @ConvBackwardFilterOp86(%67#0, %114#0) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> tensor<256x256x3x3xf16> - %117 = call @Unknown87(%67#1, %115) : (tensor<4x256x14x14xi1>, tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf16> - %118:3 = call @BatchNormGradOp88(%65, %arg68, %117) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %119 = call @ConvBackwardDataOp89(%118#0, %19) : (tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> - %120 = call @ConvBackwardFilterOp90(%64#0, %118#0) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> tensor<256x256x3x3xf16> - %121 = call @Unknown91(%113, %119, %64#1) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) -> tensor<4x256x14x14xf16> - %122:3 = call @BatchNormGradOp92(%62, %arg58, %121) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %123 = call @ConvBackwardDataOp93(%122#0, %18) : (tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> - %124 = call @ConvBackwardFilterOp94(%61#0, %122#0) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> tensor<256x256x3x3xf16> - %125 = call @Unknown95(%61#1, %123) : (tensor<4x256x14x14xi1>, tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf16> - %126:3 = call @BatchNormGradOp96(%59, %arg53, %125) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %127 = call @ConvBackwardDataOp97(%126#0, %17) : (tensor<4x256x14x14xf16>, tensor<256x128x3x3xf16>) -> tensor<4x128x28x28xf16> - %128 = call @ConvBackwardFilterOp98(%56#0, %126#0) : (tensor<4x128x28x28xf16>, tensor<4x256x14x14xf16>) -> tensor<256x128x3x3xf16> - %129:3 = call @BatchNormGradOp99(%57, %arg63, %121) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) - %130 = call @ConvBackwardDataOp100(%129#0, %16) : (tensor<4x256x14x14xf16>, tensor<256x128x1x1xf16>) -> tensor<4x128x28x28xf16> - %131 = call @ConvBackwardFilterOp101(%56#0, %129#0) : (tensor<4x128x28x28xf16>, tensor<4x256x14x14xf16>) -> tensor<256x128x1x1xf16> - %132 = call @Unknown102(%130, %127, %56#1) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) -> tensor<4x128x28x28xf16> - %133:3 = call @BatchNormGradOp103(%54, %arg48, %132) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %134 = call @ConvBackwardDataOp104(%133#0, %15) : (tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> - %135 = call 
@ConvBackwardFilterOp105(%53#0, %133#0) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> tensor<128x128x3x3xf16> - %136 = call @Unknown106(%53#1, %134) : (tensor<4x128x28x28xi1>, tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf16> - %137:3 = call @BatchNormGradOp107(%51, %arg43, %136) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %138 = call @ConvBackwardDataOp108(%137#0, %14) : (tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> - %139 = call @ConvBackwardFilterOp109(%50#0, %137#0) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> tensor<128x128x3x3xf16> - %140 = call @Unknown110(%132, %138, %50#1) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) -> tensor<4x128x28x28xf16> - %141:3 = call @BatchNormGradOp111(%48, %arg33, %140) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %142 = call @ConvBackwardDataOp112(%141#0, %13) : (tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> - %143 = call @ConvBackwardFilterOp113(%47#0, %141#0) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> tensor<128x128x3x3xf16> - %144 = call @Unknown114(%47#1, %142) : (tensor<4x128x28x28xi1>, tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf16> - %145:3 = call @BatchNormGradOp115(%45, %arg28, %144) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %146 = call @ConvBackwardDataOp116(%145#0, %12) : (tensor<4x128x28x28xf16>, tensor<128x64x3x3xf16>) -> tensor<4x64x56x56xf16> - %147 = call @ConvBackwardFilterOp117(%42#0, %145#0) : (tensor<4x64x56x56xf16>, tensor<4x128x28x28xf16>) -> tensor<128x64x3x3xf16> - %148:3 = call @BatchNormGradOp118(%43, %arg38, %140) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) - %149 = call @ConvBackwardDataOp119(%148#0, %11) : (tensor<4x128x28x28xf16>, tensor<128x64x1x1xf16>) -> tensor<4x64x56x56xf16> - %150 = call @ConvBackwardFilterOp120(%42#0, %148#0) : (tensor<4x64x56x56xf16>, tensor<4x128x28x28xf16>) -> tensor<128x64x1x1xf16> - %151 = call @Unknown121(%149, %146, %42#1) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) -> tensor<4x64x56x56xf16> - %152:3 = call @BatchNormGradOp122(%40, %arg23, %151) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) - %153 = call @ConvBackwardDataOp123(%152#0, %10) : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> - %154 = call @ConvBackwardFilterOp124(%39#0, %152#0) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> - %155 = call @Unknown125(%39#1, %153) : (tensor<4x64x56x56xi1>, tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> - %156:3 = call @BatchNormGradOp126(%37, %arg18, %155) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) - %157 = call @ConvBackwardDataOp127(%156#0, %9) : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> - %158 = call @ConvBackwardFilterOp128(%36#0, %156#0) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> - %159 = call @Unknown129(%151, %157, %36#1) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>, 
tensor<4x64x56x56xi1>) -> tensor<4x64x56x56xf16> - %160:3 = call @BatchNormGradOp130(%34, %arg13, %159) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) - %161 = call @ConvBackwardDataOp131(%160#0, %8) : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> - %162 = call @ConvBackwardFilterOp132(%33#0, %160#0) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> - %163 = call @Unknown133(%33#1, %161) : (tensor<4x64x56x56xi1>, tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> - %164:3 = call @BatchNormGradOp134(%31, %arg8, %163) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) - %165 = call @ConvBackwardDataOp135(%164#0, %7) : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> - %166 = call @ConvBackwardFilterOp136(%30, %164#0) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> - %167 = call @Unknown137(%159, %165) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> - %168 = "mhlo.select_and_scatter"(%29#0, %167, %1) ({ + %36:2 = call @Unknown30(%35, %30) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) + %37 = mhlo.convolution(%36#0, %8) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> + %38 = call @BatchNormTrainingOp27(%37, %arg18, %arg19) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> tensor<4x64x56x56xf16> + %39:2 = call @Unknown28(%38) : (tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) + %40 = mhlo.convolution(%39#0, %9) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> + %41 = call @BatchNormTrainingOp27(%40, %arg23, %arg24) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) -> tensor<4x64x56x56xf16> + %42:2 = call @Unknown30(%41, %36#0) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) + %43 = mhlo.convolution(%42#0, %10) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<128x64x1x1xf16>) -> tensor<4x128x28x28xf16> + %44 = call @BatchNormTrainingOp35(%43, %arg38, %arg39) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> tensor<4x128x28x28xf16> + %45 = mhlo.convolution(%42#0, %11) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x64x56x56xf16>, tensor<128x64x3x3xf16>) -> tensor<4x128x28x28xf16> + %46 = call @BatchNormTrainingOp35(%45, %arg28, %arg29) : (tensor<4x128x28x28xf16>, tensor<128xf32>, 
tensor<128xf32>) -> tensor<4x128x28x28xf16> + %47:2 = call @Unknown37(%46) : (tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) + %48 = mhlo.convolution(%47#0, %12) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> + %49 = call @BatchNormTrainingOp35(%48, %arg33, %arg34) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> tensor<4x128x28x28xf16> + %50:2 = call @Unknown39(%49, %44) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) + %51 = mhlo.convolution(%50#0, %13) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> + %52 = call @BatchNormTrainingOp35(%51, %arg43, %arg44) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> tensor<4x128x28x28xf16> + %53:2 = call @Unknown37(%52) : (tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) + %54 = mhlo.convolution(%53#0, %14) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> + %55 = call @BatchNormTrainingOp35(%54, %arg48, %arg49) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) -> tensor<4x128x28x28xf16> + %56:2 = call @Unknown39(%55, %50#0) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) + %57 = mhlo.convolution(%56#0, %15) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<256x128x1x1xf16>) -> tensor<4x256x14x14xf16> + %58 = call @BatchNormTrainingOp44(%57, %arg63, %arg64) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> tensor<4x256x14x14xf16> + %59 = mhlo.convolution(%56#0, %16) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x128x28x28xf16>, tensor<256x128x3x3xf16>) -> tensor<4x256x14x14xf16> + %60 = call @BatchNormTrainingOp44(%59, %arg53, %arg54) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> tensor<4x256x14x14xf16> + %61:2 = call @Unknown46(%60) : (tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) + %62 = mhlo.convolution(%61#0, %17) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) -> 
tensor<4x256x14x14xf16> + %63 = call @BatchNormTrainingOp44(%62, %arg58, %arg59) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> tensor<4x256x14x14xf16> + %64:2 = call @Unknown48(%63, %58) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) + %65 = mhlo.convolution(%64#0, %18) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> + %66 = call @BatchNormTrainingOp44(%65, %arg68, %arg69) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> tensor<4x256x14x14xf16> + %67:2 = call @Unknown46(%66) : (tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) + %68 = mhlo.convolution(%67#0, %19) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> + %69 = call @BatchNormTrainingOp44(%68, %arg73, %arg74) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) -> tensor<4x256x14x14xf16> + %70:2 = call @Unknown48(%69, %64#0) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) + %71 = mhlo.convolution(%70#0, %20) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[0, 0], [0, 0]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<512x256x1x1xf16>) -> tensor<4x512x7x7xf16> + %72 = call @BatchNormTrainingOp53(%71, %arg88, %arg89) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> tensor<4x512x7x7xf16> + %73 = mhlo.convolution(%70#0, %21) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [2, 2], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x256x14x14xf16>, tensor<512x256x3x3xf16>) -> tensor<4x512x7x7xf16> + %74 = call @BatchNormTrainingOp53(%73, %arg78, %arg79) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> tensor<4x512x7x7xf16> + %75:2 = call @Unknown55(%74) : (tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) + %76 = mhlo.convolution(%75#0, %22) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> + %77 = call @BatchNormTrainingOp53(%76, %arg83, %arg84) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> tensor<4x512x7x7xf16> + %78:2 = call @Unknown57(%77, %72) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) + %79 = mhlo.convolution(%78#0, %23) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, 
feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> + %80 = call @BatchNormTrainingOp53(%79, %arg93, %arg94) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> tensor<4x512x7x7xf16> + %81:2 = call @Unknown55(%80) : (tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) + %82 = mhlo.convolution(%81#0, %24) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> + %83 = call @BatchNormTrainingOp53(%82, %arg98, %arg99) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) -> tensor<4x512x7x7xf16> + %84:2 = call @Unknown57(%83, %78#0) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) + %85 = call @Unknown62(%84#0) : (tensor<4x512x7x7xf16>) -> tensor<4x512xf16> + %86 = call @Unknown63(%85) : (tensor<4x512xf16>) -> tensor<4x512xf16> + %87 = "mhlo.dot_general"(%86, %26) {dot_dimension_numbers = #mhlo.dot, precision_config = [#mhlo, #mhlo]} : (tensor<4x512xf16>, tensor<1000x512xf16>) -> tensor<4x1000xf16> + %88 = call @Unknown64(%27, %87) : (tensor<1000xf16>, tensor<4x1000xf16>) -> tensor<4x1000xf16> + %89 = call @Unknown65(%88) : (tensor<4x1000xf16>) -> tensor<4xf16> + %90 = call @Unknown66(%89, %88) : (tensor<4xf16>, tensor<4x1000xf16>) -> tensor<4x1000xf16> + %91 = call @Unknown67(%90) : (tensor<4x1000xf16>) -> tensor<4xf16> + %92 = call @Unknown68(%91) : (tensor<4xf16>) -> tensor<4xf16> + %93:2 = call @Unknown69(%92, %90, %28, %25) : (tensor<4xf16>, tensor<4x1000xf16>, tensor<4xf16>, tensor<4x1000xf16>) -> (tensor<4x1000xf16>, tensor<4x1000xf16>) + %94 = "mhlo.dot"(%93#1, %26) {precision_config = [#mhlo, #mhlo]} : (tensor<4x1000xf16>, tensor<1000x512xf16>) -> tensor<4x512xf16> + %95 = call @Unknown70(%94, %84#1) : (tensor<4x512xf16>, tensor<4x512x7x7xi1>) -> tensor<4x512x7x7xf16> + %96:3 = call @BatchNormGradOp71(%82, %arg98, %95) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %97 = call @ConvBackwardDataOp72(%96#0, %24) : (tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> + %98 = call @ConvBackwardFilterOp73(%81#0, %96#0) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> tensor<512x512x3x3xf16> + %99 = call @Unknown74(%81#1, %97) : (tensor<4x512x7x7xi1>, tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf16> + %100:3 = call @BatchNormGradOp71(%79, %arg93, %99) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %101 = call @ConvBackwardDataOp72(%100#0, %23) : (tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) -> tensor<4x512x7x7xf16> + %102 = call @ConvBackwardFilterOp73(%78#0, %100#0) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> tensor<512x512x3x3xf16> + %103 = call @Unknown78(%95, %101, %78#1) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) -> tensor<4x512x7x7xf16> + %104:3 = call @BatchNormGradOp71(%76, %arg83, %103) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %105 = call @ConvBackwardDataOp72(%104#0, %22) : (tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) -> 
tensor<4x512x7x7xf16> + %106 = call @ConvBackwardFilterOp73(%75#0, %104#0) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> tensor<512x512x3x3xf16> + %107 = call @Unknown74(%75#1, %105) : (tensor<4x512x7x7xi1>, tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf16> + %108:3 = call @BatchNormGradOp71(%73, %arg78, %107) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %109 = call @ConvBackwardDataOp84(%108#0, %21) : (tensor<4x512x7x7xf16>, tensor<512x256x3x3xf16>) -> tensor<4x256x14x14xf16> + %110 = call @ConvBackwardFilterOp85(%70#0, %108#0) : (tensor<4x256x14x14xf16>, tensor<4x512x7x7xf16>) -> tensor<512x256x3x3xf16> + %111:3 = call @BatchNormGradOp71(%71, %arg88, %103) : (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) + %112 = call @ConvBackwardDataOp87(%111#0, %20) : (tensor<4x512x7x7xf16>, tensor<512x256x1x1xf16>) -> tensor<4x256x14x14xf16> + %113 = call @ConvBackwardFilterOp88(%70#0, %111#0) : (tensor<4x256x14x14xf16>, tensor<4x512x7x7xf16>) -> tensor<512x256x1x1xf16> + %114 = call @Unknown89(%112, %109, %70#1) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) -> tensor<4x256x14x14xf16> + %115:3 = call @BatchNormGradOp90(%68, %arg73, %114) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %116 = call @ConvBackwardDataOp91(%115#0, %19) : (tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> + %117 = call @ConvBackwardFilterOp92(%67#0, %115#0) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> tensor<256x256x3x3xf16> + %118 = call @Unknown93(%67#1, %116) : (tensor<4x256x14x14xi1>, tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf16> + %119:3 = call @BatchNormGradOp90(%65, %arg68, %118) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %120 = call @ConvBackwardDataOp91(%119#0, %18) : (tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> + %121 = call @ConvBackwardFilterOp92(%64#0, %119#0) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> tensor<256x256x3x3xf16> + %122 = call @Unknown89(%114, %120, %64#1) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) -> tensor<4x256x14x14xf16> + %123:3 = call @BatchNormGradOp90(%62, %arg58, %122) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %124 = call @ConvBackwardDataOp91(%123#0, %17) : (tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) -> tensor<4x256x14x14xf16> + %125 = call @ConvBackwardFilterOp92(%61#0, %123#0) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> tensor<256x256x3x3xf16> + %126 = call @Unknown93(%61#1, %124) : (tensor<4x256x14x14xi1>, tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf16> + %127:3 = call @BatchNormGradOp90(%59, %arg53, %126) : (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %128 = call @ConvBackwardDataOp103(%127#0, %16) : (tensor<4x256x14x14xf16>, tensor<256x128x3x3xf16>) -> tensor<4x128x28x28xf16> + %129 = call @ConvBackwardFilterOp104(%56#0, %127#0) : (tensor<4x128x28x28xf16>, tensor<4x256x14x14xf16>) -> tensor<256x128x3x3xf16> + %130:3 = call @BatchNormGradOp90(%57, %arg63, %122) : (tensor<4x256x14x14xf16>, 
tensor<256xf32>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) + %131 = call @ConvBackwardDataOp106(%130#0, %15) : (tensor<4x256x14x14xf16>, tensor<256x128x1x1xf16>) -> tensor<4x128x28x28xf16> + %132 = call @ConvBackwardFilterOp107(%56#0, %130#0) : (tensor<4x128x28x28xf16>, tensor<4x256x14x14xf16>) -> tensor<256x128x1x1xf16> + %133 = call @Unknown108(%131, %128, %56#1) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) -> tensor<4x128x28x28xf16> + %134:3 = call @BatchNormGradOp109(%54, %arg48, %133) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %135 = call @ConvBackwardDataOp110(%134#0, %14) : (tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> + %136 = call @ConvBackwardFilterOp111(%53#0, %134#0) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> tensor<128x128x3x3xf16> + %137 = call @Unknown112(%53#1, %135) : (tensor<4x128x28x28xi1>, tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf16> + %138:3 = call @BatchNormGradOp109(%51, %arg43, %137) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %139 = call @ConvBackwardDataOp110(%138#0, %13) : (tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> + %140 = call @ConvBackwardFilterOp111(%50#0, %138#0) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> tensor<128x128x3x3xf16> + %141 = call @Unknown108(%133, %139, %50#1) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) -> tensor<4x128x28x28xf16> + %142:3 = call @BatchNormGradOp109(%48, %arg33, %141) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %143 = call @ConvBackwardDataOp110(%142#0, %12) : (tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) -> tensor<4x128x28x28xf16> + %144 = call @ConvBackwardFilterOp111(%47#0, %142#0) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> tensor<128x128x3x3xf16> + %145 = call @Unknown112(%47#1, %143) : (tensor<4x128x28x28xi1>, tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf16> + %146:3 = call @BatchNormGradOp109(%45, %arg28, %145) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %147 = call @ConvBackwardDataOp122(%146#0, %11) : (tensor<4x128x28x28xf16>, tensor<128x64x3x3xf16>) -> tensor<4x64x56x56xf16> + %148 = call @ConvBackwardFilterOp123(%42#0, %146#0) : (tensor<4x64x56x56xf16>, tensor<4x128x28x28xf16>) -> tensor<128x64x3x3xf16> + %149:3 = call @BatchNormGradOp109(%43, %arg38, %141) : (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) + %150 = call @ConvBackwardDataOp125(%149#0, %10) : (tensor<4x128x28x28xf16>, tensor<128x64x1x1xf16>) -> tensor<4x64x56x56xf16> + %151 = call @ConvBackwardFilterOp126(%42#0, %149#0) : (tensor<4x64x56x56xf16>, tensor<4x128x28x28xf16>) -> tensor<128x64x1x1xf16> + %152 = call @Unknown127(%150, %147, %42#1) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) -> tensor<4x64x56x56xf16> + %153:3 = call @BatchNormGradOp128(%40, %arg23, %152) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) + %154 = call @ConvBackwardDataOp129(%153#0, %9) : 
(tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> + %155 = call @ConvBackwardFilterOp130(%39#0, %153#0) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> + %156 = call @Unknown131(%39#1, %154) : (tensor<4x64x56x56xi1>, tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> + %157:3 = call @BatchNormGradOp128(%37, %arg18, %156) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) + %158 = call @ConvBackwardDataOp129(%157#0, %8) : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> + %159 = call @ConvBackwardFilterOp130(%36#0, %157#0) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> + %160 = call @Unknown127(%152, %158, %36#1) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) -> tensor<4x64x56x56xf16> + %161:3 = call @BatchNormGradOp128(%34, %arg13, %160) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) + %162 = call @ConvBackwardDataOp129(%161#0, %7) : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> + %163 = call @ConvBackwardFilterOp130(%33#0, %161#0) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> + %164 = call @Unknown131(%33#1, %162) : (tensor<4x64x56x56xi1>, tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> + %165:3 = call @BatchNormGradOp128(%31, %arg8, %164) : (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) + %166 = call @ConvBackwardDataOp129(%165#0, %6) : (tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) -> tensor<4x64x56x56xf16> + %167 = call @ConvBackwardFilterOp130(%30, %165#0) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<64x64x3x3xf16> + %168 = call @Unknown143(%160, %166) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> + %169 = "mhlo.select_and_scatter"(%29#0, %168, %0) ({ ^bb0(%arg104: tensor, %arg105: tensor): - %198 = mhlo.compare GE, %arg104, %arg105 : (tensor, tensor) -> tensor - mhlo.return %198 : tensor + %199 = mhlo.compare GE, %arg104, %arg105 : (tensor, tensor) -> tensor + mhlo.return %199 : tensor }, { ^bb0(%arg104: tensor, %arg105: tensor): - %198 = mhlo.add %arg104, %arg105 : tensor - mhlo.return %198 : tensor + %199 = mhlo.add %arg104, %arg105 : tensor + mhlo.return %199 : tensor }) {padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : (tensor<4x64x112x112xf16>, tensor<4x64x56x56xf16>, tensor) -> tensor<4x64x112x112xf16> - %169 = call @Unknown138(%29#1, %168) : (tensor<4x64x112x112xi1>, tensor<4x64x112x112xf16>) -> tensor<4x64x112x112xf16> - %170:3 = call @BatchNormGradOp139(%5, %arg3, %169) : (tensor<4x64x112x112xf16>, tensor<64xf32>, tensor<4x64x112x112xf16>) -> (tensor<4x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) - %171 = call @ConvBackwardFilterOp140(%3, %170#0) : (tensor<4x3x224x224xf16>, tensor<4x64x112x112xf16>) -> tensor<64x3x7x7xf16> - %172 = mhlo.reduce(%92#1 init: %0) across dimensions = [0, 1] : (tensor<4x1000xf32>, tensor) -> tensor - reducer(%arg104: tensor, %arg105: tensor) { - %198 = mhlo.add %arg104, %arg105 : tensor - mhlo.return %198 : tensor - } - %173 = call @Unknown141(%172) : (tensor) -> tensor - %174 = call @Unknown142(%171) : (tensor<64x3x7x7xf16>) -> 
tensor<64x3x7x7xf32> - %175 = call @Unknown143(%166) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - %176 = call @Unknown144(%162) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - %177 = call @Unknown145(%158) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - %178 = call @Unknown146(%154) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - %179 = call @Unknown147(%147) : (tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> - %180 = call @Unknown148(%143) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> - %181 = call @Unknown149(%150) : (tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> - %182 = call @Unknown150(%139) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> - %183 = call @Unknown151(%135) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> - %184 = call @Unknown152(%128) : (tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> - %185 = call @Unknown153(%124) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> - %186 = call @Unknown154(%131) : (tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> - %187 = call @Unknown155(%120) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> - %188 = call @Unknown156(%116) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> - %189 = call @Unknown157(%109) : (tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> - %190 = call @Unknown158(%105) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> - %191 = call @Unknown159(%112) : (tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> - %192 = call @Unknown160(%101) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> - %193 = call @Unknown161(%97) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> - %194 = call @MatmulOp162(%86, %92#0) : (tensor<4x512xf16>, tensor<4x1000xf16>) -> tensor<1000x512xf16> - %195 = call @Unknown163(%194) : (tensor<1000x512xf16>) -> tensor<1000x512xf32> - %196 = mhlo.reduce(%92#2 init: %0) across dimensions = [0] : (tensor<4x1000xf32>, tensor) -> tensor<1000xf32> - reducer(%arg104: tensor, %arg105: tensor) { - %198 = mhlo.add %arg104, %arg105 : tensor - mhlo.return %198 : tensor - } - %197 = call @Unknown164(%196) : (tensor<1000xf32>) -> tensor<1000xf32> - return %173, %174, %170#1, %170#2, %175, %164#1, %164#2, %176, %160#1, %160#2, %177, %156#1, %156#2, %178, %152#1, %152#2, %179, %145#1, %145#2, %180, %141#1, %141#2, %181, %148#1, %148#2, %182, %137#1, %137#2, %183, %133#1, %133#2, %184, %126#1, %126#2, %185, %122#1, %122#2, %186, %129#1, %129#2, %187, %118#1, %118#2, %188, %114#1, %114#2, %189, %107#1, %107#2, %190, %103#1, %103#2, %191, %110#1, %110#2, %192, %99#1, %99#2, %193, %95#1, %95#2, %195, %197 : tensor, tensor<64x3x7x7xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128x64x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x64x1x1xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256x128x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x128x1x1xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512x256x3x3xf32>, tensor<512xf32>, tensor<512xf32>, 
tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x256x1x1xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1000x512xf32>, tensor<1000xf32> + %170 = call @Unknown144(%29#1, %169) : (tensor<4x64x112x112xi1>, tensor<4x64x112x112xf16>) -> tensor<4x64x112x112xf16> + %171:3 = call @BatchNormGradOp145(%4, %arg3, %170) : (tensor<4x64x112x112xf16>, tensor<64xf32>, tensor<4x64x112x112xf16>) -> (tensor<4x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) + %172 = call @ConvBackwardFilterOp146(%2, %171#0) : (tensor<4x3x224x224xf16>, tensor<4x64x112x112xf16>) -> tensor<64x3x7x7xf16> + %173 = call @Unknown147(%93#0, %arg1) : (tensor<4x1000xf16>, tensor<4x1000xf32>) -> tensor + %174 = call @Unknown148(%173) : (tensor) -> tensor + %175 = call @Unknown149(%172) : (tensor<64x3x7x7xf16>) -> tensor<64x3x7x7xf32> + %176 = call @Unknown150(%167) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> + %177 = call @Unknown150(%163) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> + %178 = call @Unknown150(%159) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> + %179 = call @Unknown150(%155) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> + %180 = call @Unknown154(%148) : (tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> + %181 = call @Unknown155(%144) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> + %182 = call @Unknown156(%151) : (tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> + %183 = call @Unknown155(%140) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> + %184 = call @Unknown155(%136) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> + %185 = call @Unknown159(%129) : (tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> + %186 = call @Unknown160(%125) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> + %187 = call @Unknown161(%132) : (tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> + %188 = call @Unknown160(%121) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> + %189 = call @Unknown160(%117) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> + %190 = call @Unknown164(%110) : (tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> + %191 = call @Unknown165(%106) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> + %192 = call @Unknown166(%113) : (tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> + %193 = call @Unknown165(%102) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> + %194 = call @Unknown165(%98) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> + %195 = call @MatmulOp169(%86, %93#1) : (tensor<4x512xf16>, tensor<4x1000xf16>) -> tensor<1000x512xf16> + %196 = call @Unknown170(%195) : (tensor<1000x512xf16>) -> tensor<1000x512xf32> + %197 = call @Unknown171(%93#1) : (tensor<4x1000xf16>) -> tensor<1000xf32> + %198 = call @Unknown172(%197) : (tensor<1000xf32>) -> tensor<1000xf32> + return %174, %175, %171#1, %171#2, %176, %165#1, %165#2, %177, %161#1, %161#2, %178, %157#1, %157#2, %179, %153#1, %153#2, %180, %146#1, %146#2, %181, %142#1, %142#2, %182, %149#1, %149#2, %183, %138#1, %138#2, %184, %134#1, %134#2, %185, %127#1, %127#2, %186, %123#1, %123#2, %187, %130#1, %130#2, %188, %119#1, %119#2, %189, %115#1, %115#2, %190, %108#1, %108#2, %191, %104#1, %104#2, %192, %111#1, %111#2, %193, %100#1, %100#2, %194, %96#1, %96#2, %196, %198 : tensor, tensor<64x3x7x7xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, 
tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128x64x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x64x1x1xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256x128x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x128x1x1xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512x256x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x256x1x1xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1000x512xf32>, tensor<1000xf32> } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/Whole/4_bufferize_opt.mlir b/compiler/test/E2E/ResNet18/Whole/4_bufferize_opt.mlir index 92a4816fa..5a9ff6014 100644 --- a/compiler/test/E2E/ResNet18/Whole/4_bufferize_opt.mlir +++ b/compiler/test/E2E/ResNet18/Whole/4_bufferize_opt.mlir @@ -2,674 +2,2016 @@ // CHECK-LABEL: func.func @main -#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> -#map1 = affine_map<(d0, d1) -> (d0, d1)> -#map2 = affine_map<(d0, d1) -> (d1)> -#map3 = affine_map<(d0, d1) -> (d0)> -#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> -#map5 = affine_map<() -> ()> -#map6 = affine_map<(d0) -> (d0)> +#map = affine_map<() -> ()> +#map1 = affine_map<(d0) -> (d0 * 2 - (d0 floordiv 512) * 1024, 1000)> +#map2 = affine_map<(d0) -> (d0 * 2 - (d0 floordiv 512) * 1024 + 2, 1000)> +#map3 = affine_map<(d0, d1) -> (d0 - d1)> +#map4 = affine_map<(d0) -> (d0 * 2)> +#map5 = affine_map<(d0) -> (d0 * 2 + 1)> +#map6 = affine_map<(d0) -> (d0 mod 64, 49)> +#map7 = affine_map<(d0) -> (d0 mod 64 + 1, 49)> +#map8 = affine_map<(d0) -> (d0 mod 128, 125)> +#map9 = affine_map<(d0) -> (d0 mod 128 + 1, 125)> +#map10 = affine_map<(d0) -> (d0 * 32)> +#map11 = affine_map<(d0) -> (d0 * -32 + 1000, 32)> +#map12 = affine_map<(d0, d1) -> (d1 * -32 + 1000, 32, d0)> +#map13 = affine_map<(d0, d1) -> (d1 * -32 + 1000, 32, d0 + 1)> +#map14 = affine_map<(d0)[s0] -> (d0 * 32 + s0)> module @IrToMhlo.2452 { func.func private @Unknown0(%arg0: tensor<4x3x224x224xf32>) -> tensor<4x3x224x224xf16> attributes {__byteir_elementwise_fusion__} { + %c224 = arith.constant 224 : index + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<4x3x224x224xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<4x3x224x224xf32>) outs(%0 : tensor<4x3x224x224xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<4x3x224x224xf16> + %1 = scf.for %arg1 = %c0 to %c4 step %c1 iter_args(%arg2 = %0) -> (tensor<4x3x224x224xf16>) { + %2 = scf.for %arg3 = %c0 to %c3 step %c1 iter_args(%arg4 = %arg2) -> (tensor<4x3x224x224xf16>) { + %3 = scf.for %arg5 = %c0 to %c224 step %c1 iter_args(%arg6 = %arg4) -> (tensor<4x3x224x224xf16>) { + %4 = scf.for %arg7 = %c0 to %c224 step %c1 iter_args(%arg8 = %arg6) -> (tensor<4x3x224x224xf16>) { + %extracted_slice = 
tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x3x224x224xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x3x224x224xf16> + scf.yield %inserted_slice : tensor<4x3x224x224xf16> + } + scf.yield %4 : tensor<4x3x224x224xf16> + } + scf.yield %3 : tensor<4x3x224x224xf16> + } + scf.yield %2 : tensor<4x3x224x224xf16> + } return %1 : tensor<4x3x224x224xf16> } func.func private @Unknown1(%arg0: tensor<64x3x7x7xf32>) -> tensor<64x3x7x7xf16> attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<64x3x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x3x7x7xf32>) outs(%0 : tensor<64x3x7x7xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<64x3x7x7xf16> + %1 = scf.for %arg1 = %c0 to %c64 step %c1 iter_args(%arg2 = %0) -> (tensor<64x3x7x7xf16>) { + %2 = scf.for %arg3 = %c0 to %c3 step %c1 iter_args(%arg4 = %arg2) -> (tensor<64x3x7x7xf16>) { + %3 = scf.for %arg5 = %c0 to %c7 step %c1 iter_args(%arg6 = %arg4) -> (tensor<64x3x7x7xf16>) { + %4 = scf.for %arg7 = %c0 to %c7 step %c1 iter_args(%arg8 = %arg6) -> (tensor<64x3x7x7xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<64x3x7x7xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<64x3x7x7xf16> + scf.yield %inserted_slice : tensor<64x3x7x7xf16> + } + scf.yield %4 : tensor<64x3x7x7xf16> + } + scf.yield %3 : tensor<64x3x7x7xf16> + } + scf.yield %2 : tensor<64x3x7x7xf16> + } return %1 : tensor<64x3x7x7xf16> } func.func private @Unknown3(%arg0: tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<64x64x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf32>) outs(%0 : tensor<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<64x64x3x3xf16> - return %1 : tensor<64x64x3x3xf16> - } - func.func private @Unknown4(%arg0: tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<64x64x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf32>) outs(%0 : tensor<64x64x3x3xf16>) { - 
^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<64x64x3x3xf16> - return %1 : tensor<64x64x3x3xf16> - } - func.func private @Unknown5(%arg0: tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<64x64x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf32>) outs(%0 : tensor<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<64x64x3x3xf16> - return %1 : tensor<64x64x3x3xf16> - } - func.func private @Unknown6(%arg0: tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<64x64x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf32>) outs(%0 : tensor<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<64x64x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c64 step %c1 iter_args(%arg2 = %0) -> (tensor<64x64x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %arg2) -> (tensor<64x64x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<64x64x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<64x64x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<64x64x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<64x64x3x3xf16> + scf.yield %inserted_slice : tensor<64x64x3x3xf16> + } + scf.yield %4 : tensor<64x64x3x3xf16> + } + scf.yield %3 : tensor<64x64x3x3xf16> + } + scf.yield %2 : tensor<64x64x3x3xf16> + } return %1 : tensor<64x64x3x3xf16> } func.func private @Unknown7(%arg0: tensor<128x64x1x1xf32>) -> tensor<128x64x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<128x64x1x1xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x64x1x1xf32>) outs(%0 : tensor<128x64x1x1xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<128x64x1x1xf16> + %1 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<128x64x1x1xf16>) { + %2 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %arg2) -> (tensor<128x64x1x1xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<128x64x1x1xf32> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f32, %out: f16): + %5 = arith.truncf %in : f32 to f16 + linalg.yield %5 : f16 + } -> tensor + %inserted_slice = 
tensor.insert_slice %4 into %arg4[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<128x64x1x1xf16> + scf.yield %inserted_slice : tensor<128x64x1x1xf16> + } + scf.yield %2 : tensor<128x64x1x1xf16> + } return %1 : tensor<128x64x1x1xf16> } func.func private @Unknown8(%arg0: tensor<128x64x3x3xf32>) -> tensor<128x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<128x64x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x64x3x3xf32>) outs(%0 : tensor<128x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<128x64x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<128x64x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %arg2) -> (tensor<128x64x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<128x64x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<128x64x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<128x64x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<128x64x3x3xf16> + scf.yield %inserted_slice : tensor<128x64x3x3xf16> + } + scf.yield %4 : tensor<128x64x3x3xf16> + } + scf.yield %3 : tensor<128x64x3x3xf16> + } + scf.yield %2 : tensor<128x64x3x3xf16> + } return %1 : tensor<128x64x3x3xf16> } func.func private @Unknown9(%arg0: tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<128x128x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x128x3x3xf32>) outs(%0 : tensor<128x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<128x128x3x3xf16> - return %1 : tensor<128x128x3x3xf16> - } - func.func private @Unknown10(%arg0: tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<128x128x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x128x3x3xf32>) outs(%0 : tensor<128x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<128x128x3x3xf16> - return %1 : tensor<128x128x3x3xf16> - } - func.func private @Unknown11(%arg0: tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<128x128x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", 
"parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x128x3x3xf32>) outs(%0 : tensor<128x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<128x128x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<128x128x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %arg2) -> (tensor<128x128x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<128x128x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<128x128x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<128x128x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<128x128x3x3xf16> + scf.yield %inserted_slice : tensor<128x128x3x3xf16> + } + scf.yield %4 : tensor<128x128x3x3xf16> + } + scf.yield %3 : tensor<128x128x3x3xf16> + } + scf.yield %2 : tensor<128x128x3x3xf16> + } return %1 : tensor<128x128x3x3xf16> } func.func private @Unknown12(%arg0: tensor<256x128x1x1xf32>) -> tensor<256x128x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<256x128x1x1xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x128x1x1xf32>) outs(%0 : tensor<256x128x1x1xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<256x128x1x1xf16> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 iter_args(%arg2 = %0) -> (tensor<256x128x1x1xf16>) { + %2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %arg2) -> (tensor<256x128x1x1xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<256x128x1x1xf32> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f32, %out: f16): + %5 = arith.truncf %in : f32 to f16 + linalg.yield %5 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<256x128x1x1xf16> + scf.yield %inserted_slice : tensor<256x128x1x1xf16> + } + scf.yield %2 : tensor<256x128x1x1xf16> + } return %1 : tensor<256x128x1x1xf16> } func.func private @Unknown13(%arg0: tensor<256x128x3x3xf32>) -> tensor<256x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<256x128x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x128x3x3xf32>) outs(%0 : tensor<256x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> 
tensor<256x128x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 iter_args(%arg2 = %0) -> (tensor<256x128x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %arg2) -> (tensor<256x128x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<256x128x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<256x128x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<256x128x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<256x128x3x3xf16> + scf.yield %inserted_slice : tensor<256x128x3x3xf16> + } + scf.yield %4 : tensor<256x128x3x3xf16> + } + scf.yield %3 : tensor<256x128x3x3xf16> + } + scf.yield %2 : tensor<256x128x3x3xf16> + } return %1 : tensor<256x128x3x3xf16> } func.func private @Unknown14(%arg0: tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<256x256x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x256x3x3xf32>) outs(%0 : tensor<256x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<256x256x3x3xf16> - return %1 : tensor<256x256x3x3xf16> - } - func.func private @Unknown15(%arg0: tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<256x256x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x256x3x3xf32>) outs(%0 : tensor<256x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<256x256x3x3xf16> - return %1 : tensor<256x256x3x3xf16> - } - func.func private @Unknown16(%arg0: tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<256x256x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x256x3x3xf32>) outs(%0 : tensor<256x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<256x256x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 iter_args(%arg2 = %0) -> (tensor<256x256x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %arg2) -> (tensor<256x256x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<256x256x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<256x256x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<256x256x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} 
ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<256x256x3x3xf16> + scf.yield %inserted_slice : tensor<256x256x3x3xf16> + } + scf.yield %4 : tensor<256x256x3x3xf16> + } + scf.yield %3 : tensor<256x256x3x3xf16> + } + scf.yield %2 : tensor<256x256x3x3xf16> + } return %1 : tensor<256x256x3x3xf16> } func.func private @Unknown17(%arg0: tensor<512x256x1x1xf32>) -> tensor<512x256x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<512x256x1x1xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x256x1x1xf32>) outs(%0 : tensor<512x256x1x1xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<512x256x1x1xf16> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<512x256x1x1xf16>) { + %2 = scf.for %arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %arg2) -> (tensor<512x256x1x1xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<512x256x1x1xf32> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f32, %out: f16): + %5 = arith.truncf %in : f32 to f16 + linalg.yield %5 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<512x256x1x1xf16> + scf.yield %inserted_slice : tensor<512x256x1x1xf16> + } + scf.yield %2 : tensor<512x256x1x1xf16> + } return %1 : tensor<512x256x1x1xf16> } func.func private @Unknown18(%arg0: tensor<512x256x3x3xf32>) -> tensor<512x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<512x256x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x256x3x3xf32>) outs(%0 : tensor<512x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<512x256x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<512x256x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %arg2) -> (tensor<512x256x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<512x256x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<512x256x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<512x256x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into 
%arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<512x256x3x3xf16> + scf.yield %inserted_slice : tensor<512x256x3x3xf16> + } + scf.yield %4 : tensor<512x256x3x3xf16> + } + scf.yield %3 : tensor<512x256x3x3xf16> + } + scf.yield %2 : tensor<512x256x3x3xf16> + } return %1 : tensor<512x256x3x3xf16> } func.func private @Unknown19(%arg0: tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<512x512x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x512x3x3xf32>) outs(%0 : tensor<512x512x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<512x512x3x3xf16> - return %1 : tensor<512x512x3x3xf16> - } - func.func private @Unknown20(%arg0: tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<512x512x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x512x3x3xf32>) outs(%0 : tensor<512x512x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<512x512x3x3xf16> - return %1 : tensor<512x512x3x3xf16> - } - func.func private @Unknown21(%arg0: tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<512x512x3x3xf16> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x512x3x3xf32>) outs(%0 : tensor<512x512x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<512x512x3x3xf16> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<512x512x3x3xf16>) { + %2 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %arg2) -> (tensor<512x512x3x3xf16>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<512x512x3x3xf16>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<512x512x3x3xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<512x512x3x3xf32> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f32, %out: f16): + %7 = arith.truncf %in : f32 to f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<512x512x3x3xf16> + scf.yield %inserted_slice : tensor<512x512x3x3xf16> + } + scf.yield %4 : tensor<512x512x3x3xf16> + } + scf.yield %3 : tensor<512x512x3x3xf16> + } + scf.yield %2 : tensor<512x512x3x3xf16> + } return %1 : tensor<512x512x3x3xf16> } func.func private @Unknown22(%arg0: tensor<4x1000xf32>) -> tensor<4x1000xf16> attributes {__byteir_elementwise_fusion__} { + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant -2.500000e-01 : f32 %0 = tensor.empty() : tensor<4x1000xf16> 
- %1 = linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<4x1000xf32>) outs(%0 : tensor<4x1000xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.mulf %in, %cst : f32 - %3 = arith.truncf %2 : f32 to f16 - linalg.yield %3 : f16 - } -> tensor<4x1000xf16> + %1 = scf.for %arg1 = %c0 to %c4 step %c1 iter_args(%arg2 = %0) -> (tensor<4x1000xf16>) { + %2 = scf.for %arg3 = %c0 to %c1000 step %c1 iter_args(%arg4 = %arg2) -> (tensor<4x1000xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3] [1, 1] [1, 1] : tensor<4x1000xf32> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f32, %out: f16): + %5 = arith.mulf %in, %cst : f32 + %6 = arith.truncf %5 : f32 to f16 + linalg.yield %6 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3] [1, 1] [1, 1] : tensor into tensor<4x1000xf16> + scf.yield %inserted_slice : tensor<4x1000xf16> + } + scf.yield %2 : tensor<4x1000xf16> + } return %1 : tensor<4x1000xf16> } func.func private @Unknown23(%arg0: tensor<1000x512xf32>) -> tensor<1000x512xf16> attributes {__byteir_elementwise_fusion__} { + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<1000x512xf16> - %1 = linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1000x512xf32>) outs(%0 : tensor<1000x512xf16>) { - ^bb0(%in: f32, %out: f16): - %2 = arith.truncf %in : f32 to f16 - linalg.yield %2 : f16 - } -> tensor<1000x512xf16> + %1 = scf.for %arg1 = %c0 to %c1000 step %c1 iter_args(%arg2 = %0) -> (tensor<1000x512xf16>) { + %2 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %arg2) -> (tensor<1000x512xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3] [1, 1] [1, 1] : tensor<1000x512xf32> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f32, %out: f16): + %5 = arith.truncf %in : f32 to f16 + linalg.yield %5 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3] [1, 1] [1, 1] : tensor into tensor<1000x512xf16> + scf.yield %inserted_slice : tensor<1000x512xf16> + } + scf.yield %2 : tensor<1000x512xf16> + } return %1 : tensor<1000x512xf16> } - func.func private @Unknown24(%arg0: tensor<4x64x112x112xf16>) -> (tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown24(%arg0: tensor<1000xf32>) -> tensor<1000xf16> attributes {__byteir_elementwise_fusion__} { + %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index + %c0 = arith.constant 0 : index + %0 = tensor.empty() : tensor<1000xf16> + %1 = scf.for %arg1 = %c0 to %c1000 step %c1 iter_args(%arg2 = %0) -> (tensor<1000xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1] [1] [1] : tensor<1000xf32> to tensor + %2 = tensor.empty() : tensor + %3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%2 : tensor) { + ^bb0(%in: f32, %out: f16): + %4 = arith.truncf %in : f32 to f16 + linalg.yield %4 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %3 into %arg2[%arg1] [1] [1] : tensor into tensor<1000xf16> + scf.yield %inserted_slice : 
tensor<1000xf16> + } + return %1 : tensor<1000xf16> + } + func.func private @Unknown25(%arg0: tensor<4x1000xf16>) -> tensor<4xf16> attributes {__byteir_reduction_fusion__} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 0.000000e+00 : f16 + %0 = tensor.empty() : tensor<4xf16> + %1 = scf.forall (%arg1) in (4) shared_outs(%arg2 = %0) -> (tensor<4xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, 0] [1, 1000] [1, 1] : tensor<4x1000xf16> to tensor<1000xf16> + %expanded = tensor.expand_shape %extracted_slice [[0, 1]] : tensor<1000xf16> into tensor<1x1000xf16> + %extracted_slice_0 = tensor.extract_slice %arg2[%arg1] [1] [1] : tensor<4xf16> to tensor + %2 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<512xf16> + %3 = scf.forall (%arg3) in (512) shared_outs(%arg4 = %2) -> (tensor<512xf16>) { + %21 = affine.min #map1(%arg3) + %22 = affine.min #map2(%arg3) + %23 = affine.apply #map3(%22, %21) + %extracted_slice_9 = tensor.extract_slice %expanded[0, %21] [1, %23] [1, 1] : tensor<1x1000xf16> to tensor + %expanded_10 = tensor.expand_shape %extracted_slice_9 [[0, 1]] : tensor into tensor<1x?xf16> + %dim = tensor.dim %extracted_slice_9, %c0 : tensor + %24 = arith.cmpi ugt, %dim, %c0 : index + %25 = scf.if %24 -> (f16) { + %extracted = tensor.extract %expanded_10[%c0, %c0] : tensor<1x?xf16> + scf.yield %extracted : f16 + } else { + scf.yield %cst : f16 + } + %26 = arith.addf %25, %cst : f16 + %dim_11 = tensor.dim %extracted_slice_9, %c0 : tensor + %27 = arith.cmpi ugt, %dim_11, %c1 : index + %28 = scf.if %27 -> (f16) { + %extracted = tensor.extract %expanded_10[%c0, %c1] : tensor<1x?xf16> + scf.yield %extracted : f16 + } else { + scf.yield %cst : f16 + } + %29 = arith.addf %26, %28 : f16 + %extracted_slice_12 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<512xf16> to tensor + %inserted = tensor.insert %29 into %extracted_slice_12[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<512xf16> + } + } {mapping = [#gpu.thread]} + %expanded_1 = tensor.expand_shape %3 [[0, 1]] : tensor<512xf16> into tensor<256x2xf16> + %4 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<256xf16> + %5 = scf.forall (%arg3) in (256) shared_outs(%arg4 = %4) -> (tensor<256xf16>) { + %extracted = tensor.extract %expanded_1[%arg3, %c0] : tensor<256x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_1[%arg3, %c1] : tensor<256x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<256xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<256xf16> + } + } {mapping = [#gpu.thread]} + %expanded_2 = tensor.expand_shape %5 [[0, 1]] : tensor<256xf16> into tensor<128x2xf16> + %6 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<128xf16> + %7 = scf.forall (%arg3) in (128) shared_outs(%arg4 = %6) -> (tensor<128xf16>) { + %extracted = tensor.extract %expanded_2[%arg3, %c0] : tensor<128x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_2[%arg3, %c1] : tensor<128x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<128xf16> to tensor + %inserted = tensor.insert %22 into 
%extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<128xf16> + } + } {mapping = [#gpu.thread]} + %expanded_3 = tensor.expand_shape %7 [[0, 1]] : tensor<128xf16> into tensor<64x2xf16> + %8 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<64xf16> + %9 = scf.forall (%arg3) in (64) shared_outs(%arg4 = %8) -> (tensor<64xf16>) { + %extracted = tensor.extract %expanded_3[%arg3, %c0] : tensor<64x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_3[%arg3, %c1] : tensor<64x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<64xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<64xf16> + } + } {mapping = [#gpu.thread]} + %expanded_4 = tensor.expand_shape %9 [[0, 1]] : tensor<64xf16> into tensor<32x2xf16> + %10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<32xf16> + %11 = scf.forall (%arg3) in (32) shared_outs(%arg4 = %10) -> (tensor<32xf16>) { + %extracted = tensor.extract %expanded_4[%arg3, %c0] : tensor<32x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_4[%arg3, %c1] : tensor<32x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<32xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<32xf16> + } + } {mapping = [#gpu.thread]} + %expanded_5 = tensor.expand_shape %11 [[0, 1]] : tensor<32xf16> into tensor<16x2xf16> + %12 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<16xf16> + %13 = scf.forall (%arg3) in (16) shared_outs(%arg4 = %12) -> (tensor<16xf16>) { + %extracted = tensor.extract %expanded_5[%arg3, %c0] : tensor<16x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_5[%arg3, %c1] : tensor<16x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<16xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<16xf16> + } + } {mapping = [#gpu.thread]} + %expanded_6 = tensor.expand_shape %13 [[0, 1]] : tensor<16xf16> into tensor<8x2xf16> + %14 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<8xf16> + %15 = scf.forall (%arg3) in (8) shared_outs(%arg4 = %14) -> (tensor<8xf16>) { + %extracted = tensor.extract %expanded_6[%arg3, %c0] : tensor<8x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_6[%arg3, %c1] : tensor<8x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<8xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<8xf16> + } + } {mapping = [#gpu.thread]} + %expanded_7 = tensor.expand_shape %15 [[0, 1]] : tensor<8xf16> into tensor<4x2xf16> + %16 = bufferization.alloc_tensor() 
{memory_space = #gpu.address_space} : tensor<4xf16> + %17 = scf.forall (%arg3) in (4) shared_outs(%arg4 = %16) -> (tensor<4xf16>) { + %extracted = tensor.extract %expanded_7[%arg3, %c0] : tensor<4x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_7[%arg3, %c1] : tensor<4x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<4xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<4xf16> + } + } {mapping = [#gpu.thread]} + %expanded_8 = tensor.expand_shape %17 [[0, 1]] : tensor<4xf16> into tensor<2x2xf16> + %18 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<2xf16> + %19 = scf.forall (%arg3) in (2) shared_outs(%arg4 = %18) -> (tensor<2xf16>) { + %extracted = tensor.extract %expanded_8[%arg3, %c0] : tensor<2x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_8[%arg3, %c1] : tensor<2x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<2xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<2xf16> + } + } {mapping = [#gpu.thread]} + %20 = scf.forall (%arg3) in (1) shared_outs(%arg4 = %extracted_slice_0) -> (tensor) { + %21 = affine.apply #map4(%arg3) + %extracted = tensor.extract %19[%21] : tensor<2xf16> + %22 = arith.addf %extracted, %cst : f16 + %23 = affine.apply #map5(%arg3) + %extracted_9 = tensor.extract %19[%23] : tensor<2xf16> + %24 = arith.addf %extracted_9, %22 : f16 + %inserted = tensor.insert %24 into %arg4[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[] [] [] : tensor into tensor + } + } {mapping = [#gpu.thread]} + scf.forall.in_parallel { + tensor.parallel_insert_slice %20 into %arg2[%arg1] [1] [1] : tensor into tensor<4xf16> + } + } {mapping = [#gpu.block]} + return %1 : tensor<4xf16> + } + func.func private @Unknown26(%arg0: tensor<4x64x112x112xf16>) -> (tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1>) attributes {__byteir_elementwise_fusion__} { + %c112 = arith.constant 112 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x64x112x112xf16> %1 = tensor.empty() : tensor<4x64x112x112xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<4x64x112x112xf16>) outs(%0, %1 : tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1>) { - ^bb0(%in: f16, %out: f16, %out_0: i1): - %3 = arith.maxnumf %in, %cst : f16 - %4 = arith.cmpf ogt, %3, %cst : f16 - linalg.yield %3, %4 : f16, i1 - } -> (tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1>) + %2:2 = scf.for %arg1 = %c0 to %c4 step %c1 iter_args(%arg2 = %0, %arg3 = %1) -> (tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1>) { + %3:2 = scf.for %arg4 = %c0 to %c64 step %c1 iter_args(%arg5 = %arg2, %arg6 = %arg3) -> (tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1>) { + %4:2 = scf.for %arg7 = %c0 to %c112 step %c1 iter_args(%arg8 = %arg5, %arg9 = %arg6) -> (tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1>) { + %5:2 = 
scf.for %arg10 = %c0 to %c112 step %c1 iter_args(%arg11 = %arg8, %arg12 = %arg9) -> (tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x112x112xf16> to tensor + %6 = tensor.empty() : tensor + %7 = tensor.empty() : tensor + %8:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%6, %7 : tensor, tensor) { + ^bb0(%in: f16, %out: f16, %out_1: i1): + %9 = arith.maximumf %in, %cst : f16 + %10 = arith.cmpf ogt, %9, %cst : f16 + linalg.yield %9, %10 : f16, i1 + } -> (tensor, tensor) + %inserted_slice = tensor.insert_slice %8#0 into %arg11[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x64x112x112xf16> + %inserted_slice_0 = tensor.insert_slice %8#1 into %arg12[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x64x112x112xi1> + scf.yield %inserted_slice, %inserted_slice_0 : tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1> + } + scf.yield %5#0, %5#1 : tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1> + } + scf.yield %4#0, %4#1 : tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1> + } + scf.yield %3#0, %3#1 : tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1> + } return %2#0, %2#1 : tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1> } - func.func private @Unknown26(%arg0: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x64x56x56xf16> - %1 = tensor.empty() : tensor<4x64x56x56xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<4x64x56x56xf16>) outs(%0, %1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) { - ^bb0(%in: f16, %out: f16, %out_0: i1): - %3 = arith.maxnumf %in, %cst : f16 - %4 = arith.cmpf ogt, %3, %cst : f16 - linalg.yield %3, %4 : f16, i1 - } -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) - return %2#0, %2#1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> - } - func.func private @Unknown28(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x64x56x56xf16> - %1 = tensor.empty() : tensor<4x64x56x56xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) outs(%0, %1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) { - ^bb0(%in: f16, %in_0: f16, %out: f16, %out_1: i1): - %3 = arith.addf %in, %in_0 : f16 - %4 = arith.maxnumf %3, %cst : f16 - %5 = arith.cmpf ogt, %4, %cst : f16 - linalg.yield %4, %5 : f16, i1 - } -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) - return %2#0, %2#1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> - } - func.func private @Unknown30(%arg0: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown28(%arg0: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { + %c56 = arith.constant 56 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = 
arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x64x56x56xf16> %1 = tensor.empty() : tensor<4x64x56x56xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<4x64x56x56xf16>) outs(%0, %1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) { - ^bb0(%in: f16, %out: f16, %out_0: i1): - %3 = arith.maxnumf %in, %cst : f16 - %4 = arith.cmpf ogt, %3, %cst : f16 - linalg.yield %3, %4 : f16, i1 - } -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) + %2:2 = scf.for %arg1 = %c0 to %c4 step %c1 iter_args(%arg2 = %0, %arg3 = %1) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) { + %3:2 = scf.for %arg4 = %c0 to %c64 step %c1 iter_args(%arg5 = %arg2, %arg6 = %arg3) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) { + %4:2 = scf.for %arg7 = %c0 to %c56 step %c1 iter_args(%arg8 = %arg5, %arg9 = %arg6) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) { + %5:2 = scf.for %arg10 = %c0 to %c56 step %c1 iter_args(%arg11 = %arg8, %arg12 = %arg9) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x56x56xf16> to tensor + %6 = tensor.empty() : tensor + %7 = tensor.empty() : tensor + %8:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%6, %7 : tensor, tensor) { + ^bb0(%in: f16, %out: f16, %out_1: i1): + %9 = arith.maximumf %in, %cst : f16 + %10 = arith.cmpf ogt, %9, %cst : f16 + linalg.yield %9, %10 : f16, i1 + } -> (tensor, tensor) + %inserted_slice = tensor.insert_slice %8#0 into %arg11[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x64x56x56xf16> + %inserted_slice_0 = tensor.insert_slice %8#1 into %arg12[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x64x56x56xi1> + scf.yield %inserted_slice, %inserted_slice_0 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> + } + scf.yield %5#0, %5#1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> + } + scf.yield %4#0, %4#1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> + } + scf.yield %3#0, %3#1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> + } return %2#0, %2#1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> } - func.func private @Unknown32(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown30(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { + %c56 = arith.constant 56 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x64x56x56xf16> %1 = tensor.empty() : tensor<4x64x56x56xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) outs(%0, %1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) { - ^bb0(%in: f16, %in_0: f16, %out: f16, %out_1: i1): - %3 = arith.addf %in, %in_0 : f16 - %4 = arith.maxnumf %3, %cst : f16 - %5 = arith.cmpf ogt, %4, %cst : f16 - linalg.yield %4, %5 : f16, i1 - } -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) + %2:2 = scf.for 
%arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %0, %arg4 = %1) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) { + %3:2 = scf.for %arg5 = %c0 to %c64 step %c1 iter_args(%arg6 = %arg3, %arg7 = %arg4) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) { + %4:2 = scf.for %arg8 = %c0 to %c56 step %c1 iter_args(%arg9 = %arg6, %arg10 = %arg7) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) { + %5:2 = scf.for %arg11 = %c0 to %c56 step %c1 iter_args(%arg12 = %arg9, %arg13 = %arg10) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) { + %extracted_slice = tensor.extract_slice %arg0[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x56x56xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x56x56xf16> to tensor + %6 = tensor.empty() : tensor + %7 = tensor.empty() : tensor + %8:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%6, %7 : tensor, tensor) { + ^bb0(%in: f16, %in_2: f16, %out: f16, %out_3: i1): + %9 = arith.addf %in, %in_2 : f16 + %10 = arith.maximumf %9, %cst : f16 + %11 = arith.cmpf ogt, %10, %cst : f16 + linalg.yield %10, %11 : f16, i1 + } -> (tensor, tensor) + %inserted_slice = tensor.insert_slice %8#0 into %arg12[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x64x56x56xf16> + %inserted_slice_1 = tensor.insert_slice %8#1 into %arg13[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x64x56x56xi1> + scf.yield %inserted_slice, %inserted_slice_1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> + } + scf.yield %5#0, %5#1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> + } + scf.yield %4#0, %4#1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> + } + scf.yield %3#0, %3#1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> + } return %2#0, %2#1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1> } - func.func private @Unknown35(%arg0: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x128x28x28xf16> - %1 = tensor.empty() : tensor<4x128x28x28xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<4x128x28x28xf16>) outs(%0, %1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) { - ^bb0(%in: f16, %out: f16, %out_0: i1): - %3 = arith.maxnumf %in, %cst : f16 - %4 = arith.cmpf ogt, %3, %cst : f16 - linalg.yield %3, %4 : f16, i1 - } -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) - return %2#0, %2#1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> - } - func.func private @Unknown37(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown37(%arg0: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { + %c28 = arith.constant 28 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x128x28x28xf16> %1 = tensor.empty() : tensor<4x128x28x28xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", 
"parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) outs(%0, %1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) { - ^bb0(%in: f16, %in_0: f16, %out: f16, %out_1: i1): - %3 = arith.addf %in, %in_0 : f16 - %4 = arith.maxnumf %3, %cst : f16 - %5 = arith.cmpf ogt, %4, %cst : f16 - linalg.yield %4, %5 : f16, i1 - } -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) + %2:2 = scf.for %arg1 = %c0 to %c4 step %c1 iter_args(%arg2 = %0, %arg3 = %1) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) { + %3:2 = scf.for %arg4 = %c0 to %c128 step %c1 iter_args(%arg5 = %arg2, %arg6 = %arg3) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) { + %4:2 = scf.for %arg7 = %c0 to %c28 step %c1 iter_args(%arg8 = %arg5, %arg9 = %arg6) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) { + %5:2 = scf.for %arg10 = %c0 to %c28 step %c1 iter_args(%arg11 = %arg8, %arg12 = %arg9) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x128x28x28xf16> to tensor + %6 = tensor.empty() : tensor + %7 = tensor.empty() : tensor + %8:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%6, %7 : tensor, tensor) { + ^bb0(%in: f16, %out: f16, %out_1: i1): + %9 = arith.maximumf %in, %cst : f16 + %10 = arith.cmpf ogt, %9, %cst : f16 + linalg.yield %9, %10 : f16, i1 + } -> (tensor, tensor) + %inserted_slice = tensor.insert_slice %8#0 into %arg11[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x128x28x28xf16> + %inserted_slice_0 = tensor.insert_slice %8#1 into %arg12[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x128x28x28xi1> + scf.yield %inserted_slice, %inserted_slice_0 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> + } + scf.yield %5#0, %5#1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> + } + scf.yield %4#0, %4#1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> + } + scf.yield %3#0, %3#1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> + } return %2#0, %2#1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> } - func.func private @Unknown39(%arg0: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown39(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { + %c28 = arith.constant 28 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x128x28x28xf16> %1 = tensor.empty() : tensor<4x128x28x28xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<4x128x28x28xf16>) outs(%0, %1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) { - ^bb0(%in: f16, %out: f16, %out_0: i1): - %3 = arith.maxnumf %in, %cst : f16 - %4 = arith.cmpf ogt, %3, %cst : f16 - linalg.yield %3, %4 : f16, i1 - } -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) + %2:2 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %0, %arg4 = %1) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) { + %3:2 = scf.for %arg5 = %c0 to %c128 step %c1 iter_args(%arg6 = %arg3, %arg7 = %arg4) -> (tensor<4x128x28x28xf16>, 
tensor<4x128x28x28xi1>) { + %4:2 = scf.for %arg8 = %c0 to %c28 step %c1 iter_args(%arg9 = %arg6, %arg10 = %arg7) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) { + %5:2 = scf.for %arg11 = %c0 to %c28 step %c1 iter_args(%arg12 = %arg9, %arg13 = %arg10) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) { + %extracted_slice = tensor.extract_slice %arg0[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x128x28x28xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x128x28x28xf16> to tensor + %6 = tensor.empty() : tensor + %7 = tensor.empty() : tensor + %8:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%6, %7 : tensor, tensor) { + ^bb0(%in: f16, %in_2: f16, %out: f16, %out_3: i1): + %9 = arith.addf %in, %in_2 : f16 + %10 = arith.maximumf %9, %cst : f16 + %11 = arith.cmpf ogt, %10, %cst : f16 + linalg.yield %10, %11 : f16, i1 + } -> (tensor, tensor) + %inserted_slice = tensor.insert_slice %8#0 into %arg12[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x128x28x28xf16> + %inserted_slice_1 = tensor.insert_slice %8#1 into %arg13[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x128x28x28xi1> + scf.yield %inserted_slice, %inserted_slice_1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> + } + scf.yield %5#0, %5#1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> + } + scf.yield %4#0, %4#1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> + } + scf.yield %3#0, %3#1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> + } return %2#0, %2#1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> } - func.func private @Unknown41(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x128x28x28xf16> - %1 = tensor.empty() : tensor<4x128x28x28xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) outs(%0, %1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) { - ^bb0(%in: f16, %in_0: f16, %out: f16, %out_1: i1): - %3 = arith.addf %in, %in_0 : f16 - %4 = arith.maxnumf %3, %cst : f16 - %5 = arith.cmpf ogt, %4, %cst : f16 - linalg.yield %4, %5 : f16, i1 - } -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) - return %2#0, %2#1 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1> - } - func.func private @Unknown44(%arg0: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x256x14x14xf16> - %1 = tensor.empty() : tensor<4x256x14x14xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<4x256x14x14xf16>) outs(%0, %1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) { - ^bb0(%in: f16, %out: f16, %out_0: i1): - %3 = arith.maxnumf %in, %cst : f16 - %4 = arith.cmpf ogt, %3, %cst : f16 - linalg.yield %3, %4 : f16, i1 - } -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) - return %2#0, %2#1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> - } - func.func private @Unknown46(%arg0: 
tensor<4x256x14x14xf16>, %arg1: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown46(%arg0: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { + %c14 = arith.constant 14 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x256x14x14xf16> %1 = tensor.empty() : tensor<4x256x14x14xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) outs(%0, %1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) { - ^bb0(%in: f16, %in_0: f16, %out: f16, %out_1: i1): - %3 = arith.addf %in, %in_0 : f16 - %4 = arith.maxnumf %3, %cst : f16 - %5 = arith.cmpf ogt, %4, %cst : f16 - linalg.yield %4, %5 : f16, i1 - } -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) + %2:2 = scf.for %arg1 = %c0 to %c4 step %c1 iter_args(%arg2 = %0, %arg3 = %1) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) { + %3:2 = scf.for %arg4 = %c0 to %c256 step %c1 iter_args(%arg5 = %arg2, %arg6 = %arg3) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) { + %4:2 = scf.for %arg7 = %c0 to %c14 step %c1 iter_args(%arg8 = %arg5, %arg9 = %arg6) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) { + %5:2 = scf.for %arg10 = %c0 to %c14 step %c1 iter_args(%arg11 = %arg8, %arg12 = %arg9) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x256x14x14xf16> to tensor + %6 = tensor.empty() : tensor + %7 = tensor.empty() : tensor + %8:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%6, %7 : tensor, tensor) { + ^bb0(%in: f16, %out: f16, %out_1: i1): + %9 = arith.maximumf %in, %cst : f16 + %10 = arith.cmpf ogt, %9, %cst : f16 + linalg.yield %9, %10 : f16, i1 + } -> (tensor, tensor) + %inserted_slice = tensor.insert_slice %8#0 into %arg11[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x256x14x14xf16> + %inserted_slice_0 = tensor.insert_slice %8#1 into %arg12[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x256x14x14xi1> + scf.yield %inserted_slice, %inserted_slice_0 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> + } + scf.yield %5#0, %5#1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> + } + scf.yield %4#0, %4#1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> + } + scf.yield %3#0, %3#1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> + } return %2#0, %2#1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> } - func.func private @Unknown48(%arg0: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown48(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { + %c14 = arith.constant 14 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x256x14x14xf16> %1 = tensor.empty() 
: tensor<4x256x14x14xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<4x256x14x14xf16>) outs(%0, %1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) { - ^bb0(%in: f16, %out: f16, %out_0: i1): - %3 = arith.maxnumf %in, %cst : f16 - %4 = arith.cmpf ogt, %3, %cst : f16 - linalg.yield %3, %4 : f16, i1 - } -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) + %2:2 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %0, %arg4 = %1) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) { + %3:2 = scf.for %arg5 = %c0 to %c256 step %c1 iter_args(%arg6 = %arg3, %arg7 = %arg4) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) { + %4:2 = scf.for %arg8 = %c0 to %c14 step %c1 iter_args(%arg9 = %arg6, %arg10 = %arg7) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) { + %5:2 = scf.for %arg11 = %c0 to %c14 step %c1 iter_args(%arg12 = %arg9, %arg13 = %arg10) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) { + %extracted_slice = tensor.extract_slice %arg0[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x256x14x14xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x256x14x14xf16> to tensor + %6 = tensor.empty() : tensor + %7 = tensor.empty() : tensor + %8:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%6, %7 : tensor, tensor) { + ^bb0(%in: f16, %in_2: f16, %out: f16, %out_3: i1): + %9 = arith.addf %in, %in_2 : f16 + %10 = arith.maximumf %9, %cst : f16 + %11 = arith.cmpf ogt, %10, %cst : f16 + linalg.yield %10, %11 : f16, i1 + } -> (tensor, tensor) + %inserted_slice = tensor.insert_slice %8#0 into %arg12[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x256x14x14xf16> + %inserted_slice_1 = tensor.insert_slice %8#1 into %arg13[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x256x14x14xi1> + scf.yield %inserted_slice, %inserted_slice_1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> + } + scf.yield %5#0, %5#1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> + } + scf.yield %4#0, %4#1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> + } + scf.yield %3#0, %3#1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> + } return %2#0, %2#1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> } - func.func private @Unknown50(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x256x14x14xf16> - %1 = tensor.empty() : tensor<4x256x14x14xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) outs(%0, %1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) { - ^bb0(%in: f16, %in_0: f16, %out: f16, %out_1: i1): - %3 = arith.addf %in, %in_0 : f16 - %4 = arith.maxnumf %3, %cst : f16 - %5 = arith.cmpf ogt, %4, %cst : f16 - linalg.yield %4, %5 : f16, i1 - } -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) - return %2#0, %2#1 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1> - } - func.func private @Unknown53(%arg0: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { + 
func.func private @Unknown55(%arg0: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x512x7x7xf16> %1 = tensor.empty() : tensor<4x512x7x7xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<4x512x7x7xf16>) outs(%0, %1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) { - ^bb0(%in: f16, %out: f16, %out_0: i1): - %3 = arith.maxnumf %in, %cst : f16 - %4 = arith.cmpf ogt, %3, %cst : f16 - linalg.yield %3, %4 : f16, i1 - } -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) + %2:2 = scf.for %arg1 = %c0 to %c4 step %c1 iter_args(%arg2 = %0, %arg3 = %1) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) { + %3:2 = scf.for %arg4 = %c0 to %c512 step %c1 iter_args(%arg5 = %arg2, %arg6 = %arg3) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) { + %4:2 = scf.for %arg7 = %c0 to %c7 step %c1 iter_args(%arg8 = %arg5, %arg9 = %arg6) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) { + %5:2 = scf.for %arg10 = %c0 to %c7 step %c1 iter_args(%arg11 = %arg8, %arg12 = %arg9) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x512x7x7xf16> to tensor + %6 = tensor.empty() : tensor + %7 = tensor.empty() : tensor + %8:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%6, %7 : tensor, tensor) { + ^bb0(%in: f16, %out: f16, %out_1: i1): + %9 = arith.maximumf %in, %cst : f16 + %10 = arith.cmpf ogt, %9, %cst : f16 + linalg.yield %9, %10 : f16, i1 + } -> (tensor, tensor) + %inserted_slice = tensor.insert_slice %8#0 into %arg11[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x512x7x7xf16> + %inserted_slice_0 = tensor.insert_slice %8#1 into %arg12[%arg1, %arg4, %arg7, %arg10] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x512x7x7xi1> + scf.yield %inserted_slice, %inserted_slice_0 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> + } + scf.yield %5#0, %5#1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> + } + scf.yield %4#0, %4#1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> + } + scf.yield %3#0, %3#1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> + } return %2#0, %2#1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> } - func.func private @Unknown55(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown57(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x512x7x7xf16> %1 = tensor.empty() : tensor<4x512x7x7xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) outs(%0, %1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) { - ^bb0(%in: 
f16, %in_0: f16, %out: f16, %out_1: i1): - %3 = arith.addf %in, %in_0 : f16 - %4 = arith.maxnumf %3, %cst : f16 - %5 = arith.cmpf ogt, %4, %cst : f16 - linalg.yield %4, %5 : f16, i1 - } -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) + %2:2 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %0, %arg4 = %1) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) { + %3:2 = scf.for %arg5 = %c0 to %c512 step %c1 iter_args(%arg6 = %arg3, %arg7 = %arg4) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) { + %4:2 = scf.for %arg8 = %c0 to %c7 step %c1 iter_args(%arg9 = %arg6, %arg10 = %arg7) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) { + %5:2 = scf.for %arg11 = %c0 to %c7 step %c1 iter_args(%arg12 = %arg9, %arg13 = %arg10) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) { + %extracted_slice = tensor.extract_slice %arg0[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x512x7x7xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x512x7x7xf16> to tensor + %6 = tensor.empty() : tensor + %7 = tensor.empty() : tensor + %8:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%6, %7 : tensor, tensor) { + ^bb0(%in: f16, %in_2: f16, %out: f16, %out_3: i1): + %9 = arith.addf %in, %in_2 : f16 + %10 = arith.maximumf %9, %cst : f16 + %11 = arith.cmpf ogt, %10, %cst : f16 + linalg.yield %10, %11 : f16, i1 + } -> (tensor, tensor) + %inserted_slice = tensor.insert_slice %8#0 into %arg12[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x512x7x7xf16> + %inserted_slice_1 = tensor.insert_slice %8#1 into %arg13[%arg2, %arg5, %arg8, %arg11] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x512x7x7xi1> + scf.yield %inserted_slice, %inserted_slice_1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> + } + scf.yield %5#0, %5#1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> + } + scf.yield %4#0, %4#1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> + } + scf.yield %3#0, %3#1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> + } return %2#0, %2#1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> } - func.func private @Unknown57(%arg0: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown62(%arg0: tensor<4x512x7x7xf16>) -> tensor<4x512xf16> attributes {__byteir_reduction_fusion__} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x512x7x7xf16> - %1 = tensor.empty() : tensor<4x512x7x7xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<4x512x7x7xf16>) outs(%0, %1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) { - ^bb0(%in: f16, %out: f16, %out_0: i1): - %3 = arith.maxnumf %in, %cst : f16 - %4 = arith.cmpf ogt, %3, %cst : f16 - linalg.yield %3, %4 : f16, i1 - } -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) - return %2#0, %2#1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> - } - func.func private @Unknown59(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x512x7x7xf16> - %1 = tensor.empty() : tensor<4x512x7x7xi1> - %2:2 = linalg.generic {indexing_maps = [#map, #map, #map, 
#map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) outs(%0, %1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) { - ^bb0(%in: f16, %in_0: f16, %out: f16, %out_1: i1): - %3 = arith.addf %in, %in_0 : f16 - %4 = arith.maxnumf %3, %cst : f16 - %5 = arith.cmpf ogt, %4, %cst : f16 - linalg.yield %4, %5 : f16, i1 - } -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) - return %2#0, %2#1 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1> - } - func.func private @Unknown60(%arg0: tensor<4x512xf16>) -> tensor<4x512xf16> attributes {__byteir_elementwise_fusion__} { + %collapsed = tensor.collapse_shape %arg0 [[0, 1], [2, 3]] : tensor<4x512x7x7xf16> into tensor<2048x49xf16> + %0 = tensor.empty() : tensor<2048xf16> + %1 = scf.forall (%arg1) in (2048) shared_outs(%arg2 = %0) -> (tensor<2048xf16>) { + %extracted_slice = tensor.extract_slice %collapsed[%arg1, 0] [1, 49] [1, 1] : tensor<2048x49xf16> to tensor<49xf16> + %expanded_0 = tensor.expand_shape %extracted_slice [[0, 1]] : tensor<49xf16> into tensor<1x49xf16> + %extracted_slice_1 = tensor.extract_slice %arg2[%arg1] [1] [1] : tensor<2048xf16> to tensor + %2 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<64xf16> + %3 = scf.forall (%arg3) in (64) shared_outs(%arg4 = %2) -> (tensor<64xf16>) { + %15 = affine.min #map6(%arg3) + %16 = affine.min #map7(%arg3) + %17 = affine.apply #map3(%16, %15) + %extracted_slice_7 = tensor.extract_slice %expanded_0[0, %15] [1, %17] [1, 1] : tensor<1x49xf16> to tensor + %expanded_8 = tensor.expand_shape %extracted_slice_7 [[0, 1]] : tensor into tensor<1x?xf16> + %dim = tensor.dim %extracted_slice_7, %c0 : tensor + %18 = arith.cmpi ugt, %dim, %c0 : index + %19 = scf.if %18 -> (f16) { + %extracted = tensor.extract %expanded_8[%c0, %c0] : tensor<1x?xf16> + scf.yield %extracted : f16 + } else { + scf.yield %cst : f16 + } + %20 = arith.addf %19, %cst : f16 + %extracted_slice_9 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<64xf16> to tensor + %inserted = tensor.insert %20 into %extracted_slice_9[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<64xf16> + } + } {mapping = [#gpu.thread]} + %expanded_2 = tensor.expand_shape %3 [[0, 1]] : tensor<64xf16> into tensor<32x2xf16> + %4 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<32xf16> + %5 = scf.forall (%arg3) in (32) shared_outs(%arg4 = %4) -> (tensor<32xf16>) { + %extracted = tensor.extract %expanded_2[%arg3, %c0] : tensor<32x2xf16> + %15 = arith.addf %extracted, %cst : f16 + %extracted_7 = tensor.extract %expanded_2[%arg3, %c1] : tensor<32x2xf16> + %16 = arith.addf %extracted_7, %15 : f16 + %extracted_slice_8 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<32xf16> to tensor + %inserted = tensor.insert %16 into %extracted_slice_8[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<32xf16> + } + } {mapping = [#gpu.thread]} + %expanded_3 = tensor.expand_shape %5 [[0, 1]] : tensor<32xf16> into tensor<16x2xf16> + %6 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<16xf16> + %7 = scf.forall (%arg3) in (16) shared_outs(%arg4 = %6) -> (tensor<16xf16>) { + %extracted = tensor.extract %expanded_3[%arg3, %c0] : tensor<16x2xf16> + %15 = arith.addf %extracted, %cst : f16 + %extracted_7 = tensor.extract %expanded_3[%arg3, %c1] : tensor<16x2xf16> + %16 = arith.addf 
%extracted_7, %15 : f16 + %extracted_slice_8 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<16xf16> to tensor + %inserted = tensor.insert %16 into %extracted_slice_8[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<16xf16> + } + } {mapping = [#gpu.thread]} + %expanded_4 = tensor.expand_shape %7 [[0, 1]] : tensor<16xf16> into tensor<8x2xf16> + %8 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<8xf16> + %9 = scf.forall (%arg3) in (8) shared_outs(%arg4 = %8) -> (tensor<8xf16>) { + %extracted = tensor.extract %expanded_4[%arg3, %c0] : tensor<8x2xf16> + %15 = arith.addf %extracted, %cst : f16 + %extracted_7 = tensor.extract %expanded_4[%arg3, %c1] : tensor<8x2xf16> + %16 = arith.addf %extracted_7, %15 : f16 + %extracted_slice_8 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<8xf16> to tensor + %inserted = tensor.insert %16 into %extracted_slice_8[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<8xf16> + } + } {mapping = [#gpu.thread]} + %expanded_5 = tensor.expand_shape %9 [[0, 1]] : tensor<8xf16> into tensor<4x2xf16> + %10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<4xf16> + %11 = scf.forall (%arg3) in (4) shared_outs(%arg4 = %10) -> (tensor<4xf16>) { + %extracted = tensor.extract %expanded_5[%arg3, %c0] : tensor<4x2xf16> + %15 = arith.addf %extracted, %cst : f16 + %extracted_7 = tensor.extract %expanded_5[%arg3, %c1] : tensor<4x2xf16> + %16 = arith.addf %extracted_7, %15 : f16 + %extracted_slice_8 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<4xf16> to tensor + %inserted = tensor.insert %16 into %extracted_slice_8[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<4xf16> + } + } {mapping = [#gpu.thread]} + %expanded_6 = tensor.expand_shape %11 [[0, 1]] : tensor<4xf16> into tensor<2x2xf16> + %12 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<2xf16> + %13 = scf.forall (%arg3) in (2) shared_outs(%arg4 = %12) -> (tensor<2xf16>) { + %extracted = tensor.extract %expanded_6[%arg3, %c0] : tensor<2x2xf16> + %15 = arith.addf %extracted, %cst : f16 + %extracted_7 = tensor.extract %expanded_6[%arg3, %c1] : tensor<2x2xf16> + %16 = arith.addf %extracted_7, %15 : f16 + %extracted_slice_8 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<2xf16> to tensor + %inserted = tensor.insert %16 into %extracted_slice_8[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<2xf16> + } + } {mapping = [#gpu.thread]} + %14 = scf.forall (%arg3) in (1) shared_outs(%arg4 = %extracted_slice_1) -> (tensor) { + %15 = affine.apply #map4(%arg3) + %extracted = tensor.extract %13[%15] : tensor<2xf16> + %16 = arith.addf %extracted, %cst : f16 + %17 = affine.apply #map5(%arg3) + %extracted_7 = tensor.extract %13[%17] : tensor<2xf16> + %18 = arith.addf %extracted_7, %16 : f16 + %inserted = tensor.insert %18 into %arg4[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[] [] [] : tensor into tensor + } + } {mapping = [#gpu.thread]} + scf.forall.in_parallel { + tensor.parallel_insert_slice %14 into %arg2[%arg1] [1] [1] : tensor into tensor<2048xf16> + } + } {mapping = [#gpu.block]} + %expanded = tensor.expand_shape %1 [[0, 1]] : tensor<2048xf16> into tensor<4x512xf16> + return %expanded : 
tensor<4x512xf16> + } + func.func private @Unknown63(%arg0: tensor<4x512xf16>) -> tensor<4x512xf16> attributes {__byteir_elementwise_fusion__} { + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 2.040100e-02 : f16 %0 = tensor.empty() : tensor<4x512xf16> - %1 = linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<4x512xf16>) outs(%0 : tensor<4x512xf16>) { - ^bb0(%in: f16, %out: f16): - %2 = arith.mulf %in, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<4x512xf16> + %1 = scf.for %arg1 = %c0 to %c4 step %c1 iter_args(%arg2 = %0) -> (tensor<4x512xf16>) { + %2 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %arg2) -> (tensor<4x512xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3] [1, 1] [1, 1] : tensor<4x512xf16> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f16, %out: f16): + %5 = arith.mulf %in, %cst : f16 + linalg.yield %5 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3] [1, 1] [1, 1] : tensor into tensor<4x512xf16> + scf.yield %inserted_slice : tensor<4x512xf16> + } + scf.yield %2 : tensor<4x512xf16> + } return %1 : tensor<4x512xf16> } - func.func private @Unknown61(%arg0: tensor<1000xf32>, %arg1: tensor<4x1000xf16>) -> tensor<4x1000xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown64(%arg0: tensor<1000xf16>, %arg1: tensor<4x1000xf16>) -> tensor<4x1000xf16> attributes {__byteir_elementwise_fusion__} { + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<4x1000xf16> - %1 = linalg.generic {indexing_maps = [#map1, #map2, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg1, %arg0 : tensor<4x1000xf16>, tensor<1000xf32>) outs(%0 : tensor<4x1000xf16>) { - ^bb0(%in: f16, %in_0: f32, %out: f16): - %2 = arith.truncf %in_0 : f32 to f16 - %3 = arith.addf %in, %2 : f16 - linalg.yield %3 : f16 - } -> tensor<4x1000xf16> + %1 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %0) -> (tensor<4x1000xf16>) { + %2 = scf.for %arg4 = %c0 to %c1000 step %c1 iter_args(%arg5 = %arg3) -> (tensor<4x1000xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg4] [1] [1] : tensor<1000xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg4] [1, 1] [1, 1] : tensor<4x1000xf16> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%3 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + %5 = arith.addf %in_1, %in : f16 + linalg.yield %5 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg5[%arg2, %arg4] [1, 1] [1, 1] : tensor into tensor<4x1000xf16> + scf.yield %inserted_slice : tensor<4x1000xf16> + } + scf.yield %2 : tensor<4x1000xf16> + } return %1 : tensor<4x1000xf16> } - func.func private @Unknown62(%arg0: tensor<4xf16>, %arg1: tensor<4x1000xf16>) -> (tensor<4x1000xf16>, tensor<4x1000xf16>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown65(%arg0: tensor<4x1000xf16>) -> tensor<4xf16> attributes {__byteir_reduction_fusion__} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %cst = 
arith.constant 0.000000e+00 : f16 + %0 = tensor.empty() : tensor<4xf16> + %1 = scf.forall (%arg1) in (4) shared_outs(%arg2 = %0) -> (tensor<4xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, 0] [1, 1000] [1, 1] : tensor<4x1000xf16> to tensor<1000xf16> + %expanded = tensor.expand_shape %extracted_slice [[0, 1]] : tensor<1000xf16> into tensor<1x1000xf16> + %extracted_slice_0 = tensor.extract_slice %arg2[%arg1] [1] [1] : tensor<4xf16> to tensor + %2 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<512xf16> + %3 = scf.forall (%arg3) in (512) shared_outs(%arg4 = %2) -> (tensor<512xf16>) { + %21 = affine.min #map1(%arg3) + %22 = affine.min #map2(%arg3) + %23 = affine.apply #map3(%22, %21) + %extracted_slice_9 = tensor.extract_slice %expanded[0, %21] [1, %23] [1, 1] : tensor<1x1000xf16> to tensor + %expanded_10 = tensor.expand_shape %extracted_slice_9 [[0, 1]] : tensor into tensor<1x?xf16> + %dim = tensor.dim %extracted_slice_9, %c0 : tensor + %24 = arith.cmpi ugt, %dim, %c0 : index + %25 = scf.if %24 -> (f16) { + %extracted = tensor.extract %expanded_10[%c0, %c0] : tensor<1x?xf16> + scf.yield %extracted : f16 + } else { + scf.yield %cst : f16 + } + %dim_11 = tensor.dim %extracted_slice_9, %c0 : tensor + %26 = arith.cmpi ugt, %dim_11, %c1 : index + %27 = scf.if %26 -> (f16) { + %extracted = tensor.extract %expanded_10[%c0, %c1] : tensor<1x?xf16> + scf.yield %extracted : f16 + } else { + scf.yield %cst : f16 + } + %28 = arith.maximumf %25, %27 : f16 + %extracted_slice_12 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<512xf16> to tensor + %inserted = tensor.insert %28 into %extracted_slice_12[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<512xf16> + } + } {mapping = [#gpu.thread]} + %expanded_1 = tensor.expand_shape %3 [[0, 1]] : tensor<512xf16> into tensor<256x2xf16> + %4 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<256xf16> + %5 = scf.forall (%arg3) in (256) shared_outs(%arg4 = %4) -> (tensor<256xf16>) { + %extracted = tensor.extract %expanded_1[%arg3, %c0] : tensor<256x2xf16> + %extracted_9 = tensor.extract %expanded_1[%arg3, %c1] : tensor<256x2xf16> + %21 = arith.maximumf %extracted_9, %extracted : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<256xf16> to tensor + %inserted = tensor.insert %21 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<256xf16> + } + } {mapping = [#gpu.thread]} + %expanded_2 = tensor.expand_shape %5 [[0, 1]] : tensor<256xf16> into tensor<128x2xf16> + %6 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<128xf16> + %7 = scf.forall (%arg3) in (128) shared_outs(%arg4 = %6) -> (tensor<128xf16>) { + %extracted = tensor.extract %expanded_2[%arg3, %c0] : tensor<128x2xf16> + %extracted_9 = tensor.extract %expanded_2[%arg3, %c1] : tensor<128x2xf16> + %21 = arith.maximumf %extracted_9, %extracted : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<128xf16> to tensor + %inserted = tensor.insert %21 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<128xf16> + } + } {mapping = [#gpu.thread]} + %expanded_3 = tensor.expand_shape %7 [[0, 1]] : tensor<128xf16> into tensor<64x2xf16> + %8 = bufferization.alloc_tensor() {memory_space = 
#gpu.address_space} : tensor<64xf16> + %9 = scf.forall (%arg3) in (64) shared_outs(%arg4 = %8) -> (tensor<64xf16>) { + %extracted = tensor.extract %expanded_3[%arg3, %c0] : tensor<64x2xf16> + %extracted_9 = tensor.extract %expanded_3[%arg3, %c1] : tensor<64x2xf16> + %21 = arith.maximumf %extracted_9, %extracted : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<64xf16> to tensor + %inserted = tensor.insert %21 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<64xf16> + } + } {mapping = [#gpu.thread]} + %expanded_4 = tensor.expand_shape %9 [[0, 1]] : tensor<64xf16> into tensor<32x2xf16> + %10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<32xf16> + %11 = scf.forall (%arg3) in (32) shared_outs(%arg4 = %10) -> (tensor<32xf16>) { + %extracted = tensor.extract %expanded_4[%arg3, %c0] : tensor<32x2xf16> + %extracted_9 = tensor.extract %expanded_4[%arg3, %c1] : tensor<32x2xf16> + %21 = arith.maximumf %extracted_9, %extracted : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<32xf16> to tensor + %inserted = tensor.insert %21 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<32xf16> + } + } {mapping = [#gpu.thread]} + %expanded_5 = tensor.expand_shape %11 [[0, 1]] : tensor<32xf16> into tensor<16x2xf16> + %12 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<16xf16> + %13 = scf.forall (%arg3) in (16) shared_outs(%arg4 = %12) -> (tensor<16xf16>) { + %extracted = tensor.extract %expanded_5[%arg3, %c0] : tensor<16x2xf16> + %extracted_9 = tensor.extract %expanded_5[%arg3, %c1] : tensor<16x2xf16> + %21 = arith.maximumf %extracted_9, %extracted : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<16xf16> to tensor + %inserted = tensor.insert %21 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<16xf16> + } + } {mapping = [#gpu.thread]} + %expanded_6 = tensor.expand_shape %13 [[0, 1]] : tensor<16xf16> into tensor<8x2xf16> + %14 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<8xf16> + %15 = scf.forall (%arg3) in (8) shared_outs(%arg4 = %14) -> (tensor<8xf16>) { + %extracted = tensor.extract %expanded_6[%arg3, %c0] : tensor<8x2xf16> + %extracted_9 = tensor.extract %expanded_6[%arg3, %c1] : tensor<8x2xf16> + %21 = arith.maximumf %extracted_9, %extracted : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<8xf16> to tensor + %inserted = tensor.insert %21 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<8xf16> + } + } {mapping = [#gpu.thread]} + %expanded_7 = tensor.expand_shape %15 [[0, 1]] : tensor<8xf16> into tensor<4x2xf16> + %16 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<4xf16> + %17 = scf.forall (%arg3) in (4) shared_outs(%arg4 = %16) -> (tensor<4xf16>) { + %extracted = tensor.extract %expanded_7[%arg3, %c0] : tensor<4x2xf16> + %extracted_9 = tensor.extract %expanded_7[%arg3, %c1] : tensor<4x2xf16> + %21 = arith.maximumf %extracted_9, %extracted : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<4xf16> to tensor + %inserted = tensor.insert %21 into 
%extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<4xf16> + } + } {mapping = [#gpu.thread]} + %expanded_8 = tensor.expand_shape %17 [[0, 1]] : tensor<4xf16> into tensor<2x2xf16> + %18 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<2xf16> + %19 = scf.forall (%arg3) in (2) shared_outs(%arg4 = %18) -> (tensor<2xf16>) { + %extracted = tensor.extract %expanded_8[%arg3, %c0] : tensor<2x2xf16> + %extracted_9 = tensor.extract %expanded_8[%arg3, %c1] : tensor<2x2xf16> + %21 = arith.maximumf %extracted_9, %extracted : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<2xf16> to tensor + %inserted = tensor.insert %21 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<2xf16> + } + } {mapping = [#gpu.thread]} + %20 = scf.forall (%arg3) in (1) shared_outs(%arg4 = %extracted_slice_0) -> (tensor) { + %21 = affine.apply #map4(%arg3) + %extracted = tensor.extract %19[%21] : tensor<2xf16> + %22 = affine.apply #map5(%arg3) + %extracted_9 = tensor.extract %19[%22] : tensor<2xf16> + %23 = arith.maximumf %extracted_9, %extracted : f16 + %inserted = tensor.insert %23 into %arg4[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[] [] [] : tensor into tensor + } + } {mapping = [#gpu.thread]} + scf.forall.in_parallel { + tensor.parallel_insert_slice %20 into %arg2[%arg1] [1] [1] : tensor into tensor<4xf16> + } + } {mapping = [#gpu.block]} + return %1 : tensor<4xf16> + } + func.func private @Unknown66(%arg0: tensor<4xf16>, %arg1: tensor<4x1000xf16>) -> tensor<4x1000xf16> attributes {__byteir_elementwise_fusion__} { + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<4x1000xf16> - %1:2 = linalg.generic {indexing_maps = [#map1, #map3, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg1, %arg0 : tensor<4x1000xf16>, tensor<4xf16>) outs(%0, %0 : tensor<4x1000xf16>, tensor<4x1000xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16, %out_1: f16): - %2 = arith.subf %in, %in_0 : f16 - %3 = math.exp %2 : f16 - linalg.yield %2, %3 : f16, f16 - } -> (tensor<4x1000xf16>, tensor<4x1000xf16>) - return %1#0, %1#1 : tensor<4x1000xf16>, tensor<4x1000xf16> + %1 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %0) -> (tensor<4x1000xf16>) { + %2 = scf.for %arg4 = %c0 to %c1000 step %c1 iter_args(%arg5 = %arg3) -> (tensor<4x1000xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg2] [1] [1] : tensor<4xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg4] [1, 1] [1, 1] : tensor<4x1000xf16> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%3 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + %5 = arith.subf %in_1, %in : f16 + linalg.yield %5 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg5[%arg2, %arg4] [1, 1] [1, 1] : tensor into tensor<4x1000xf16> + scf.yield %inserted_slice : tensor<4x1000xf16> + } + scf.yield %2 : tensor<4x1000xf16> + } + return %1 : tensor<4x1000xf16> } - func.func private @Unknown63(%arg0: tensor<4xf16>, %arg1: tensor<4x1000xf16>, %arg2: tensor<4xf16>, %arg3: tensor<4x1000xf16>, %arg4: tensor<4x1000xf32>) -> 
(tensor<4x1000xf16>, tensor<4x1000xf32>, tensor<4x1000xf32>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown67(%arg0: tensor<4x1000xf16>) -> tensor<4xf16> attributes {__byteir_reduction_fusion__} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 0.000000e+00 : f16 + %0 = tensor.empty() : tensor<4xf16> + %1 = scf.forall (%arg1) in (4) shared_outs(%arg2 = %0) -> (tensor<4xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, 0] [1, 1000] [1, 1] : tensor<4x1000xf16> to tensor<1000xf16> + %expanded = tensor.expand_shape %extracted_slice [[0, 1]] : tensor<1000xf16> into tensor<1x1000xf16> + %extracted_slice_0 = tensor.extract_slice %arg2[%arg1] [1] [1] : tensor<4xf16> to tensor + %2 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<512xf16> + %3 = scf.forall (%arg3) in (512) shared_outs(%arg4 = %2) -> (tensor<512xf16>) { + %21 = affine.min #map1(%arg3) + %22 = affine.min #map2(%arg3) + %23 = affine.apply #map3(%22, %21) + %extracted_slice_9 = tensor.extract_slice %expanded[0, %21] [1, %23] [1, 1] : tensor<1x1000xf16> to tensor + %expanded_10 = tensor.expand_shape %extracted_slice_9 [[0, 1]] : tensor into tensor<1x?xf16> + %dim = tensor.dim %extracted_slice_9, %c0 : tensor + %24 = arith.cmpi ugt, %dim, %c0 : index + %25 = scf.if %24 -> (f16) { + %extracted = tensor.extract %expanded_10[%c0, %c0] : tensor<1x?xf16> + scf.yield %extracted : f16 + } else { + scf.yield %cst : f16 + } + %26 = math.exp %25 : f16 + %27 = arith.addf %26, %cst : f16 + %dim_11 = tensor.dim %extracted_slice_9, %c0 : tensor + %28 = arith.cmpi ugt, %dim_11, %c1 : index + %29 = scf.if %28 -> (f16) { + %extracted = tensor.extract %expanded_10[%c0, %c1] : tensor<1x?xf16> + scf.yield %extracted : f16 + } else { + scf.yield %cst : f16 + } + %30 = math.exp %29 : f16 + %31 = arith.addf %27, %30 : f16 + %extracted_slice_12 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<512xf16> to tensor + %inserted = tensor.insert %31 into %extracted_slice_12[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<512xf16> + } + } {mapping = [#gpu.thread]} + %expanded_1 = tensor.expand_shape %3 [[0, 1]] : tensor<512xf16> into tensor<256x2xf16> + %4 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<256xf16> + %5 = scf.forall (%arg3) in (256) shared_outs(%arg4 = %4) -> (tensor<256xf16>) { + %extracted = tensor.extract %expanded_1[%arg3, %c0] : tensor<256x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_1[%arg3, %c1] : tensor<256x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<256xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<256xf16> + } + } {mapping = [#gpu.thread]} + %expanded_2 = tensor.expand_shape %5 [[0, 1]] : tensor<256xf16> into tensor<128x2xf16> + %6 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<128xf16> + %7 = scf.forall (%arg3) in (128) shared_outs(%arg4 = %6) -> (tensor<128xf16>) { + %extracted = tensor.extract %expanded_2[%arg3, %c0] : tensor<128x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_2[%arg3, %c1] : tensor<128x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = 
tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<128xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<128xf16> + } + } {mapping = [#gpu.thread]} + %expanded_3 = tensor.expand_shape %7 [[0, 1]] : tensor<128xf16> into tensor<64x2xf16> + %8 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<64xf16> + %9 = scf.forall (%arg3) in (64) shared_outs(%arg4 = %8) -> (tensor<64xf16>) { + %extracted = tensor.extract %expanded_3[%arg3, %c0] : tensor<64x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_3[%arg3, %c1] : tensor<64x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<64xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<64xf16> + } + } {mapping = [#gpu.thread]} + %expanded_4 = tensor.expand_shape %9 [[0, 1]] : tensor<64xf16> into tensor<32x2xf16> + %10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<32xf16> + %11 = scf.forall (%arg3) in (32) shared_outs(%arg4 = %10) -> (tensor<32xf16>) { + %extracted = tensor.extract %expanded_4[%arg3, %c0] : tensor<32x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_4[%arg3, %c1] : tensor<32x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<32xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<32xf16> + } + } {mapping = [#gpu.thread]} + %expanded_5 = tensor.expand_shape %11 [[0, 1]] : tensor<32xf16> into tensor<16x2xf16> + %12 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<16xf16> + %13 = scf.forall (%arg3) in (16) shared_outs(%arg4 = %12) -> (tensor<16xf16>) { + %extracted = tensor.extract %expanded_5[%arg3, %c0] : tensor<16x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_5[%arg3, %c1] : tensor<16x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<16xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<16xf16> + } + } {mapping = [#gpu.thread]} + %expanded_6 = tensor.expand_shape %13 [[0, 1]] : tensor<16xf16> into tensor<8x2xf16> + %14 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<8xf16> + %15 = scf.forall (%arg3) in (8) shared_outs(%arg4 = %14) -> (tensor<8xf16>) { + %extracted = tensor.extract %expanded_6[%arg3, %c0] : tensor<8x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_6[%arg3, %c1] : tensor<8x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<8xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<8xf16> + } + } {mapping = [#gpu.thread]} + %expanded_7 = 
tensor.expand_shape %15 [[0, 1]] : tensor<8xf16> into tensor<4x2xf16> + %16 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<4xf16> + %17 = scf.forall (%arg3) in (4) shared_outs(%arg4 = %16) -> (tensor<4xf16>) { + %extracted = tensor.extract %expanded_7[%arg3, %c0] : tensor<4x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_7[%arg3, %c1] : tensor<4x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<4xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<4xf16> + } + } {mapping = [#gpu.thread]} + %expanded_8 = tensor.expand_shape %17 [[0, 1]] : tensor<4xf16> into tensor<2x2xf16> + %18 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<2xf16> + %19 = scf.forall (%arg3) in (2) shared_outs(%arg4 = %18) -> (tensor<2xf16>) { + %extracted = tensor.extract %expanded_8[%arg3, %c0] : tensor<2x2xf16> + %21 = arith.addf %extracted, %cst : f16 + %extracted_9 = tensor.extract %expanded_8[%arg3, %c1] : tensor<2x2xf16> + %22 = arith.addf %extracted_9, %21 : f16 + %extracted_slice_10 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<2xf16> to tensor + %inserted = tensor.insert %22 into %extracted_slice_10[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<2xf16> + } + } {mapping = [#gpu.thread]} + %20 = scf.forall (%arg3) in (1) shared_outs(%arg4 = %extracted_slice_0) -> (tensor) { + %21 = affine.apply #map4(%arg3) + %extracted = tensor.extract %19[%21] : tensor<2xf16> + %22 = arith.addf %extracted, %cst : f16 + %23 = affine.apply #map5(%arg3) + %extracted_9 = tensor.extract %19[%23] : tensor<2xf16> + %24 = arith.addf %extracted_9, %22 : f16 + %inserted = tensor.insert %24 into %arg4[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[] [] [] : tensor into tensor + } + } {mapping = [#gpu.thread]} + scf.forall.in_parallel { + tensor.parallel_insert_slice %20 into %arg2[%arg1] [1] [1] : tensor into tensor<4xf16> + } + } {mapping = [#gpu.block]} + return %1 : tensor<4xf16> + } + func.func private @Unknown68(%arg0: tensor<4xf16>) -> tensor<4xf16> attributes {__byteir_elementwise_fusion__} { + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index + %0 = tensor.empty() : tensor<4xf16> + %1 = scf.for %arg1 = %c0 to %c4 step %c1 iter_args(%arg2 = %0) -> (tensor<4xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1] [1] [1] : tensor<4xf16> to tensor + %2 = tensor.empty() : tensor + %3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%2 : tensor) { + ^bb0(%in: f16, %out: f16): + %4 = math.log %in : f16 + linalg.yield %4 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %3 into %arg2[%arg1] [1] [1] : tensor into tensor<4xf16> + scf.yield %inserted_slice : tensor<4xf16> + } + return %1 : tensor<4xf16> + } + func.func private @Unknown69(%arg0: tensor<4xf16>, %arg1: tensor<4x1000xf16>, %arg2: tensor<4xf16>, %arg3: tensor<4x1000xf16>) -> (tensor<4x1000xf16>, tensor<4x1000xf16>) attributes {__byteir_elementwise_fusion__} { + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : 
tensor<4x1000xf16> - %1 = tensor.empty() : tensor<4x1000xf32> - %2:3 = linalg.generic {indexing_maps = [#map1, #map1, #map3, #map3, #map1, #map1, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg3, %arg1, %arg0, %arg2, %arg4 : tensor<4x1000xf16>, tensor<4x1000xf16>, tensor<4xf16>, tensor<4xf16>, tensor<4x1000xf32>) outs(%0, %1, %1 : tensor<4x1000xf16>, tensor<4x1000xf32>, tensor<4x1000xf32>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %in_2: f16, %in_3: f32, %out: f16, %out_4: f32, %out_5: f32): - %3 = math.log %in_1 : f16 - %4 = arith.subf %in_0, %3 : f16 - %5 = math.exp %4 : f16 - %6 = arith.mulf %5, %in_2 : f16 - %7 = arith.subf %in, %6 : f16 - %8 = arith.extf %4 : f16 to f32 - %9 = arith.mulf %8, %in_3 : f32 - %10 = arith.extf %7 : f16 to f32 - linalg.yield %7, %9, %10 : f16, f32, f32 - } -> (tensor<4x1000xf16>, tensor<4x1000xf32>, tensor<4x1000xf32>) - return %2#0, %2#1, %2#2 : tensor<4x1000xf16>, tensor<4x1000xf32>, tensor<4x1000xf32> + %1:2 = scf.for %arg4 = %c0 to %c4 step %c1 iter_args(%arg5 = %0, %arg6 = %0) -> (tensor<4x1000xf16>, tensor<4x1000xf16>) { + %2:2 = scf.for %arg7 = %c0 to %c1000 step %c1 iter_args(%arg8 = %arg5, %arg9 = %arg6) -> (tensor<4x1000xf16>, tensor<4x1000xf16>) { + %extracted_slice = tensor.extract_slice %arg2[%arg4] [1] [1] : tensor<4xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg0[%arg4] [1] [1] : tensor<4xf16> to tensor + %extracted_slice_1 = tensor.extract_slice %arg1[%arg4, %arg7] [1, 1] [1, 1] : tensor<4x1000xf16> to tensor + %extracted_slice_2 = tensor.extract_slice %arg3[%arg4, %arg7] [1, 1] [1, 1] : tensor<4x1000xf16> to tensor + %3 = tensor.empty() : tensor + %4:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0, %extracted_slice_1, %extracted_slice_2 : tensor, tensor, tensor, tensor) outs(%3, %3 : tensor, tensor) { + ^bb0(%in: f16, %in_4: f16, %in_5: f16, %in_6: f16, %out: f16, %out_7: f16): + %5 = arith.subf %in_5, %in_4 : f16 + %6 = math.exp %5 : f16 + %7 = arith.mulf %6, %in : f16 + %8 = arith.subf %in_6, %7 : f16 + linalg.yield %5, %8 : f16, f16 + } -> (tensor, tensor) + %inserted_slice = tensor.insert_slice %4#0 into %arg8[%arg4, %arg7] [1, 1] [1, 1] : tensor into tensor<4x1000xf16> + %inserted_slice_3 = tensor.insert_slice %4#1 into %arg9[%arg4, %arg7] [1, 1] [1, 1] : tensor into tensor<4x1000xf16> + scf.yield %inserted_slice, %inserted_slice_3 : tensor<4x1000xf16>, tensor<4x1000xf16> + } + scf.yield %2#0, %2#1 : tensor<4x1000xf16>, tensor<4x1000xf16> + } + return %1#0, %1#1 : tensor<4x1000xf16>, tensor<4x1000xf16> } - func.func private @Unknown64(%arg0: tensor<4x512xf16>, %arg1: tensor<4x512x7x7xi1>) -> tensor<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown70(%arg0: tensor<4x512xf16>, %arg1: tensor<4x512x7x7xi1>) -> tensor<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %cst_0 = arith.constant 4.900000e+01 : f16 %0 = tensor.empty() : tensor<4x512x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map4, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg1, %arg0 : tensor<4x512x7x7xi1>, tensor<4x512xf16>) outs(%0 : tensor<4x512x7x7xf16>) { - ^bb0(%in: i1, %in_1: f16, %out: f16): - %2 = arith.divf %in_1, %cst_0 : f16 - %3 = 
arith.select %in, %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<4x512x7x7xf16> + %1 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %0) -> (tensor<4x512x7x7xf16>) { + %2 = scf.for %arg4 = %c0 to %c512 step %c1 iter_args(%arg5 = %arg3) -> (tensor<4x512x7x7xf16>) { + %3 = scf.for %arg6 = %c0 to %c7 step %c1 iter_args(%arg7 = %arg5) -> (tensor<4x512x7x7xf16>) { + %4 = scf.for %arg8 = %c0 to %c7 step %c1 iter_args(%arg9 = %arg7) -> (tensor<4x512x7x7xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg2, %arg4] [1, 1] [1, 1] : tensor<4x512xf16> to tensor + %extracted_slice_1 = tensor.extract_slice %arg1[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x512x7x7xi1> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_1 : tensor, tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %in_2: i1, %out: f16): + %7 = arith.divf %in, %cst_0 : f16 + %8 = arith.select %in_2, %7, %cst : f16 + linalg.yield %8 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg9[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x512x7x7xf16> + scf.yield %inserted_slice : tensor<4x512x7x7xf16> + } + scf.yield %4 : tensor<4x512x7x7xf16> + } + scf.yield %3 : tensor<4x512x7x7xf16> + } + scf.yield %2 : tensor<4x512x7x7xf16> + } return %1 : tensor<4x512x7x7xf16> } - func.func private @Unknown68(%arg0: tensor<4x512x7x7xi1>, %arg1: tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown74(%arg0: tensor<4x512x7x7xi1>, %arg1: tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x512x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x512x7x7xi1>, tensor<4x512x7x7xf16>) outs(%0 : tensor<4x512x7x7xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %2 = arith.select %in, %in_0, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<4x512x7x7xf16> + %1 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %0) -> (tensor<4x512x7x7xf16>) { + %2 = scf.for %arg4 = %c0 to %c512 step %c1 iter_args(%arg5 = %arg3) -> (tensor<4x512x7x7xf16>) { + %3 = scf.for %arg6 = %c0 to %c7 step %c1 iter_args(%arg7 = %arg5) -> (tensor<4x512x7x7xf16>) { + %4 = scf.for %arg8 = %c0 to %c7 step %c1 iter_args(%arg9 = %arg7) -> (tensor<4x512x7x7xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x512x7x7xi1> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x512x7x7xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%5 : tensor) { + ^bb0(%in: i1, %in_1: f16, %out: f16): + %7 = arith.select %in, %in_1, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg9[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x512x7x7xf16> + scf.yield %inserted_slice : tensor<4x512x7x7xf16> + } + scf.yield %4 : 
tensor<4x512x7x7xf16> + } + scf.yield %3 : tensor<4x512x7x7xf16> + } + scf.yield %2 : tensor<4x512x7x7xf16> + } return %1 : tensor<4x512x7x7xf16> } - func.func private @Unknown72(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<4x512x7x7xf16>, %arg2: tensor<4x512x7x7xi1>) -> tensor<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown78(%arg0: tensor<4x512x7x7xf16>, %arg1: tensor<4x512x7x7xf16>, %arg2: tensor<4x512x7x7xi1>) -> tensor<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x512x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<4x512x7x7xi1>, tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) outs(%0 : tensor<4x512x7x7xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.select %in, %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<4x512x7x7xf16> + %1 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %0) -> (tensor<4x512x7x7xf16>) { + %2 = scf.for %arg5 = %c0 to %c512 step %c1 iter_args(%arg6 = %arg4) -> (tensor<4x512x7x7xf16>) { + %3 = scf.for %arg7 = %c0 to %c7 step %c1 iter_args(%arg8 = %arg6) -> (tensor<4x512x7x7xf16>) { + %4 = scf.for %arg9 = %c0 to %c7 step %c1 iter_args(%arg10 = %arg8) -> (tensor<4x512x7x7xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x512x7x7xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x512x7x7xf16> to tensor + %extracted_slice_1 = tensor.extract_slice %arg2[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x512x7x7xi1> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0, %extracted_slice_1 : tensor, tensor, tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %in_2: f16, %in_3: i1, %out: f16): + %7 = arith.addf %in, %in_2 : f16 + %8 = arith.select %in_3, %7, %cst : f16 + linalg.yield %8 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg10[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x512x7x7xf16> + scf.yield %inserted_slice : tensor<4x512x7x7xf16> + } + scf.yield %4 : tensor<4x512x7x7xf16> + } + scf.yield %3 : tensor<4x512x7x7xf16> + } + scf.yield %2 : tensor<4x512x7x7xf16> + } return %1 : tensor<4x512x7x7xf16> } - func.func private @Unknown76(%arg0: tensor<4x512x7x7xi1>, %arg1: tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x512x7x7xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x512x7x7xi1>, tensor<4x512x7x7xf16>) outs(%0 : tensor<4x512x7x7xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %2 = arith.select %in, %in_0, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<4x512x7x7xf16> - return %1 : tensor<4x512x7x7xf16> - } - func.func private @Unknown83(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x256x14x14xf16>, %arg2: 
tensor<4x256x14x14xi1>) -> tensor<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown89(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x256x14x14xf16>, %arg2: tensor<4x256x14x14xi1>) -> tensor<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { + %c14 = arith.constant 14 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x256x14x14xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<4x256x14x14xi1>, tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) outs(%0 : tensor<4x256x14x14xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.select %in, %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<4x256x14x14xf16> + %1 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %0) -> (tensor<4x256x14x14xf16>) { + %2 = scf.for %arg5 = %c0 to %c256 step %c1 iter_args(%arg6 = %arg4) -> (tensor<4x256x14x14xf16>) { + %3 = scf.for %arg7 = %c0 to %c14 step %c1 iter_args(%arg8 = %arg6) -> (tensor<4x256x14x14xf16>) { + %4 = scf.for %arg9 = %c0 to %c14 step %c1 iter_args(%arg10 = %arg8) -> (tensor<4x256x14x14xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x256x14x14xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x256x14x14xf16> to tensor + %extracted_slice_1 = tensor.extract_slice %arg2[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x256x14x14xi1> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0, %extracted_slice_1 : tensor, tensor, tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %in_2: f16, %in_3: i1, %out: f16): + %7 = arith.addf %in, %in_2 : f16 + %8 = arith.select %in_3, %7, %cst : f16 + linalg.yield %8 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg10[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x256x14x14xf16> + scf.yield %inserted_slice : tensor<4x256x14x14xf16> + } + scf.yield %4 : tensor<4x256x14x14xf16> + } + scf.yield %3 : tensor<4x256x14x14xf16> + } + scf.yield %2 : tensor<4x256x14x14xf16> + } return %1 : tensor<4x256x14x14xf16> } - func.func private @Unknown87(%arg0: tensor<4x256x14x14xi1>, %arg1: tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown93(%arg0: tensor<4x256x14x14xi1>, %arg1: tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { + %c14 = arith.constant 14 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x256x14x14xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x256x14x14xi1>, tensor<4x256x14x14xf16>) outs(%0 : tensor<4x256x14x14xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %2 = arith.select %in, %in_0, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<4x256x14x14xf16> + %1 = 
scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %0) -> (tensor<4x256x14x14xf16>) { + %2 = scf.for %arg4 = %c0 to %c256 step %c1 iter_args(%arg5 = %arg3) -> (tensor<4x256x14x14xf16>) { + %3 = scf.for %arg6 = %c0 to %c14 step %c1 iter_args(%arg7 = %arg5) -> (tensor<4x256x14x14xf16>) { + %4 = scf.for %arg8 = %c0 to %c14 step %c1 iter_args(%arg9 = %arg7) -> (tensor<4x256x14x14xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x256x14x14xi1> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x256x14x14xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%5 : tensor) { + ^bb0(%in: i1, %in_1: f16, %out: f16): + %7 = arith.select %in, %in_1, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg9[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x256x14x14xf16> + scf.yield %inserted_slice : tensor<4x256x14x14xf16> + } + scf.yield %4 : tensor<4x256x14x14xf16> + } + scf.yield %3 : tensor<4x256x14x14xf16> + } + scf.yield %2 : tensor<4x256x14x14xf16> + } return %1 : tensor<4x256x14x14xf16> } - func.func private @Unknown91(%arg0: tensor<4x256x14x14xf16>, %arg1: tensor<4x256x14x14xf16>, %arg2: tensor<4x256x14x14xi1>) -> tensor<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x256x14x14xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<4x256x14x14xi1>, tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) outs(%0 : tensor<4x256x14x14xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.select %in, %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<4x256x14x14xf16> - return %1 : tensor<4x256x14x14xf16> - } - func.func private @Unknown95(%arg0: tensor<4x256x14x14xi1>, %arg1: tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x256x14x14xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x256x14x14xi1>, tensor<4x256x14x14xf16>) outs(%0 : tensor<4x256x14x14xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %2 = arith.select %in, %in_0, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<4x256x14x14xf16> - return %1 : tensor<4x256x14x14xf16> - } - func.func private @Unknown102(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>, %arg2: tensor<4x128x28x28xi1>) -> tensor<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x128x28x28xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<4x128x28x28xi1>, tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) outs(%0 : tensor<4x128x28x28xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.select %in, %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<4x128x28x28xf16> - 
return %1 : tensor<4x128x28x28xf16> - } - func.func private @Unknown106(%arg0: tensor<4x128x28x28xi1>, %arg1: tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown108(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>, %arg2: tensor<4x128x28x28xi1>) -> tensor<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { + %c28 = arith.constant 28 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x128x28x28xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x128x28x28xi1>, tensor<4x128x28x28xf16>) outs(%0 : tensor<4x128x28x28xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %2 = arith.select %in, %in_0, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<4x128x28x28xf16> + %1 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %0) -> (tensor<4x128x28x28xf16>) { + %2 = scf.for %arg5 = %c0 to %c128 step %c1 iter_args(%arg6 = %arg4) -> (tensor<4x128x28x28xf16>) { + %3 = scf.for %arg7 = %c0 to %c28 step %c1 iter_args(%arg8 = %arg6) -> (tensor<4x128x28x28xf16>) { + %4 = scf.for %arg9 = %c0 to %c28 step %c1 iter_args(%arg10 = %arg8) -> (tensor<4x128x28x28xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x128x28x28xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x128x28x28xf16> to tensor + %extracted_slice_1 = tensor.extract_slice %arg2[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x128x28x28xi1> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0, %extracted_slice_1 : tensor, tensor, tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %in_2: f16, %in_3: i1, %out: f16): + %7 = arith.addf %in, %in_2 : f16 + %8 = arith.select %in_3, %7, %cst : f16 + linalg.yield %8 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg10[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x128x28x28xf16> + scf.yield %inserted_slice : tensor<4x128x28x28xf16> + } + scf.yield %4 : tensor<4x128x28x28xf16> + } + scf.yield %3 : tensor<4x128x28x28xf16> + } + scf.yield %2 : tensor<4x128x28x28xf16> + } return %1 : tensor<4x128x28x28xf16> } - func.func private @Unknown110(%arg0: tensor<4x128x28x28xf16>, %arg1: tensor<4x128x28x28xf16>, %arg2: tensor<4x128x28x28xi1>) -> tensor<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown112(%arg0: tensor<4x128x28x28xi1>, %arg1: tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { + %c28 = arith.constant 28 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x128x28x28xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<4x128x28x28xi1>, tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) outs(%0 : tensor<4x128x28x28xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, 
%out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.select %in, %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<4x128x28x28xf16> + %1 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %0) -> (tensor<4x128x28x28xf16>) { + %2 = scf.for %arg4 = %c0 to %c128 step %c1 iter_args(%arg5 = %arg3) -> (tensor<4x128x28x28xf16>) { + %3 = scf.for %arg6 = %c0 to %c28 step %c1 iter_args(%arg7 = %arg5) -> (tensor<4x128x28x28xf16>) { + %4 = scf.for %arg8 = %c0 to %c28 step %c1 iter_args(%arg9 = %arg7) -> (tensor<4x128x28x28xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x128x28x28xi1> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x128x28x28xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%5 : tensor) { + ^bb0(%in: i1, %in_1: f16, %out: f16): + %7 = arith.select %in, %in_1, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg9[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x128x28x28xf16> + scf.yield %inserted_slice : tensor<4x128x28x28xf16> + } + scf.yield %4 : tensor<4x128x28x28xf16> + } + scf.yield %3 : tensor<4x128x28x28xf16> + } + scf.yield %2 : tensor<4x128x28x28xf16> + } return %1 : tensor<4x128x28x28xf16> } - func.func private @Unknown114(%arg0: tensor<4x128x28x28xi1>, %arg1: tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x128x28x28xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x128x28x28xi1>, tensor<4x128x28x28xf16>) outs(%0 : tensor<4x128x28x28xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %2 = arith.select %in, %in_0, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<4x128x28x28xf16> - return %1 : tensor<4x128x28x28xf16> - } - func.func private @Unknown121(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>, %arg2: tensor<4x64x56x56xi1>) -> tensor<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown127(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>, %arg2: tensor<4x64x56x56xi1>) -> tensor<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + %c56 = arith.constant 56 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<4x64x56x56xi1>, tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) outs(%0 : tensor<4x64x56x56xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.select %in, %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<4x64x56x56xf16> + %1 = scf.for %arg3 = %c0 to %c4 step %c1 iter_args(%arg4 = %0) -> (tensor<4x64x56x56xf16>) { + %2 = scf.for %arg5 = %c0 to %c64 step %c1 iter_args(%arg6 = %arg4) -> (tensor<4x64x56x56xf16>) { + %3 = scf.for %arg7 = %c0 to %c56 step %c1 iter_args(%arg8 = %arg6) -> 
(tensor<4x64x56x56xf16>) { + %4 = scf.for %arg9 = %c0 to %c56 step %c1 iter_args(%arg10 = %arg8) -> (tensor<4x64x56x56xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x56x56xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x56x56xf16> to tensor + %extracted_slice_1 = tensor.extract_slice %arg2[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x56x56xi1> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0, %extracted_slice_1 : tensor, tensor, tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %in_2: f16, %in_3: i1, %out: f16): + %7 = arith.addf %in, %in_2 : f16 + %8 = arith.select %in_3, %7, %cst : f16 + linalg.yield %8 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg10[%arg3, %arg5, %arg7, %arg9] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x64x56x56xf16> + scf.yield %inserted_slice : tensor<4x64x56x56xf16> + } + scf.yield %4 : tensor<4x64x56x56xf16> + } + scf.yield %3 : tensor<4x64x56x56xf16> + } + scf.yield %2 : tensor<4x64x56x56xf16> + } return %1 : tensor<4x64x56x56xf16> } - func.func private @Unknown125(%arg0: tensor<4x64x56x56xi1>, %arg1: tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown131(%arg0: tensor<4x64x56x56xi1>, %arg1: tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + %c56 = arith.constant 56 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x64x56x56xi1>, tensor<4x64x56x56xf16>) outs(%0 : tensor<4x64x56x56xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %2 = arith.select %in, %in_0, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<4x64x56x56xf16> + %1 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %0) -> (tensor<4x64x56x56xf16>) { + %2 = scf.for %arg4 = %c0 to %c64 step %c1 iter_args(%arg5 = %arg3) -> (tensor<4x64x56x56xf16>) { + %3 = scf.for %arg6 = %c0 to %c56 step %c1 iter_args(%arg7 = %arg5) -> (tensor<4x64x56x56xf16>) { + %4 = scf.for %arg8 = %c0 to %c56 step %c1 iter_args(%arg9 = %arg7) -> (tensor<4x64x56x56xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x56x56xi1> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x56x56xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%5 : tensor) { + ^bb0(%in: i1, %in_1: f16, %out: f16): + %7 = arith.select %in, %in_1, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg9[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x64x56x56xf16> + scf.yield %inserted_slice : tensor<4x64x56x56xf16> + } + scf.yield %4 : tensor<4x64x56x56xf16> + } + scf.yield %3 : tensor<4x64x56x56xf16> + } + scf.yield %2 : 
tensor<4x64x56x56xf16> + } return %1 : tensor<4x64x56x56xf16> } - func.func private @Unknown129(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>, %arg2: tensor<4x64x56x56xi1>) -> tensor<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 + func.func private @Unknown143(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + %c56 = arith.constant 56 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<4x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : tensor<4x64x56x56xi1>, tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) outs(%0 : tensor<4x64x56x56xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %2 = arith.addf %in_0, %in_1 : f16 - %3 = arith.select %in, %2, %cst : f16 - linalg.yield %3 : f16 - } -> tensor<4x64x56x56xf16> + %1 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %0) -> (tensor<4x64x56x56xf16>) { + %2 = scf.for %arg4 = %c0 to %c64 step %c1 iter_args(%arg5 = %arg3) -> (tensor<4x64x56x56xf16>) { + %3 = scf.for %arg6 = %c0 to %c56 step %c1 iter_args(%arg7 = %arg5) -> (tensor<4x64x56x56xf16>) { + %4 = scf.for %arg8 = %c0 to %c56 step %c1 iter_args(%arg9 = %arg7) -> (tensor<4x64x56x56xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x56x56xf16> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x56x56xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %in_1: f16, %out: f16): + %7 = arith.addf %in, %in_1 : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg9[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x64x56x56xf16> + scf.yield %inserted_slice : tensor<4x64x56x56xf16> + } + scf.yield %4 : tensor<4x64x56x56xf16> + } + scf.yield %3 : tensor<4x64x56x56xf16> + } + scf.yield %2 : tensor<4x64x56x56xf16> + } return %1 : tensor<4x64x56x56xf16> } - func.func private @Unknown133(%arg0: tensor<4x64x56x56xi1>, %arg1: tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %0 = tensor.empty() : tensor<4x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x64x56x56xi1>, tensor<4x64x56x56xf16>) outs(%0 : tensor<4x64x56x56xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %2 = arith.select %in, %in_0, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<4x64x56x56xf16> - return %1 : tensor<4x64x56x56xf16> - } - func.func private @Unknown137(%arg0: tensor<4x64x56x56xf16>, %arg1: tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<4x64x56x56xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) outs(%0 : 
tensor<4x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %2 = arith.addf %in, %in_0 : f16 - linalg.yield %2 : f16 - } -> tensor<4x64x56x56xf16> - return %1 : tensor<4x64x56x56xf16> - } - func.func private @Unknown138(%arg0: tensor<4x64x112x112xi1>, %arg1: tensor<4x64x112x112xf16>) -> tensor<4x64x112x112xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown144(%arg0: tensor<4x64x112x112xi1>, %arg1: tensor<4x64x112x112xf16>) -> tensor<4x64x112x112xf16> attributes {__byteir_elementwise_fusion__} { + %c112 = arith.constant 112 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %0 = tensor.empty() : tensor<4x64x112x112xf16> - %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x64x112x112xi1>, tensor<4x64x112x112xf16>) outs(%0 : tensor<4x64x112x112xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %2 = arith.select %in, %in_0, %cst : f16 - linalg.yield %2 : f16 - } -> tensor<4x64x112x112xf16> + %1 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %0) -> (tensor<4x64x112x112xf16>) { + %2 = scf.for %arg4 = %c0 to %c64 step %c1 iter_args(%arg5 = %arg3) -> (tensor<4x64x112x112xf16>) { + %3 = scf.for %arg6 = %c0 to %c112 step %c1 iter_args(%arg7 = %arg5) -> (tensor<4x64x112x112xf16>) { + %4 = scf.for %arg8 = %c0 to %c112 step %c1 iter_args(%arg9 = %arg7) -> (tensor<4x64x112x112xf16>) { + %extracted_slice = tensor.extract_slice %arg0[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x112x112xi1> to tensor + %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<4x64x112x112xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%extracted_slice, %extracted_slice_0 : tensor, tensor) outs(%5 : tensor) { + ^bb0(%in: i1, %in_1: f16, %out: f16): + %7 = arith.select %in, %in_1, %cst : f16 + linalg.yield %7 : f16 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg9[%arg2, %arg4, %arg6, %arg8] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<4x64x112x112xf16> + scf.yield %inserted_slice : tensor<4x64x112x112xf16> + } + scf.yield %4 : tensor<4x64x112x112xf16> + } + scf.yield %3 : tensor<4x64x112x112xf16> + } + scf.yield %2 : tensor<4x64x112x112xf16> + } return %1 : tensor<4x64x112x112xf16> } - func.func private @Unknown141(%arg0: tensor) -> tensor attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown147(%arg0: tensor<4x1000xf16>, %arg1: tensor<4x1000xf32>) -> tensor attributes {__byteir_reduction_fusion__} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 0.000000e+00 : f16 + %cst_0 = arith.constant 0.000000e+00 : f32 + %0 = tensor.empty() : tensor + %collapsed = tensor.collapse_shape %arg0 [[0, 1]] : tensor<4x1000xf16> into tensor<4000xf16> + %collapsed_1 = tensor.collapse_shape %arg1 [[0, 1]] : tensor<4x1000xf32> into tensor<4000xf32> + %expanded = tensor.expand_shape %collapsed [[0, 1]] : tensor<4000xf16> into tensor<32x125xf16> + %expanded_2 = tensor.expand_shape %collapsed_1 [[0, 1]] : tensor<4000xf32> into tensor<32x125xf32> + %1 = tensor.empty() : tensor<32xf32> + %2 = scf.forall (%arg2) in (32) shared_outs(%arg3 = %1) -> (tensor<32xf32>) { + %extracted_slice = tensor.extract_slice %expanded[%arg2, 0] [1, 125] [1, 
1] : tensor<32x125xf16> to tensor<125xf16> + %expanded_3 = tensor.expand_shape %extracted_slice [[0, 1]] : tensor<125xf16> into tensor<1x125xf16> + %extracted_slice_4 = tensor.extract_slice %expanded_2[%arg2, 0] [1, 125] [1, 1] : tensor<32x125xf32> to tensor<125xf32> + %expanded_5 = tensor.expand_shape %extracted_slice_4 [[0, 1]] : tensor<125xf32> into tensor<1x125xf32> + %extracted_slice_6 = tensor.extract_slice %arg3[%arg2] [1] [1] : tensor<32xf32> to tensor + %4 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<128xf32> + %5 = scf.forall (%arg4) in (128) shared_outs(%arg5 = %4) -> (tensor<128xf32>) { + %19 = affine.min #map8(%arg4) + %20 = affine.min #map9(%arg4) + %21 = affine.apply #map3(%20, %19) + %extracted_slice_13 = tensor.extract_slice %expanded_3[0, %19] [1, %21] [1, 1] : tensor<1x125xf16> to tensor + %expanded_14 = tensor.expand_shape %extracted_slice_13 [[0, 1]] : tensor into tensor<1x?xf16> + %extracted_slice_15 = tensor.extract_slice %expanded_5[0, %19] [1, %21] [1, 1] : tensor<1x125xf32> to tensor + %expanded_16 = tensor.expand_shape %extracted_slice_15 [[0, 1]] : tensor into tensor<1x?xf32> + %dim = tensor.dim %extracted_slice_13, %c0 : tensor + %22 = arith.cmpi ugt, %dim, %c0 : index + %23 = scf.if %22 -> (f16) { + %extracted = tensor.extract %expanded_14[%c0, %c0] : tensor<1x?xf16> + scf.yield %extracted : f16 + } else { + scf.yield %cst : f16 + } + %dim_17 = tensor.dim %extracted_slice_15, %c0 : tensor + %24 = arith.cmpi ugt, %dim_17, %c0 : index + %25 = scf.if %24 -> (f32) { + %extracted = tensor.extract %expanded_16[%c0, %c0] : tensor<1x?xf32> + scf.yield %extracted : f32 + } else { + scf.yield %cst_0 : f32 + } + %26 = arith.extf %23 : f16 to f32 + %27 = arith.mulf %26, %25 : f32 + %28 = arith.addf %27, %cst_0 : f32 + %extracted_slice_18 = tensor.extract_slice %arg5[%arg4] [1] [1] : tensor<128xf32> to tensor + %inserted = tensor.insert %28 into %extracted_slice_18[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg4] [1] [1] : tensor into tensor<128xf32> + } + } {mapping = [#gpu.thread]} + %expanded_7 = tensor.expand_shape %5 [[0, 1]] : tensor<128xf32> into tensor<64x2xf32> + %6 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<64xf32> + %7 = scf.forall (%arg4) in (64) shared_outs(%arg5 = %6) -> (tensor<64xf32>) { + %extracted = tensor.extract %expanded_7[%arg4, %c0] : tensor<64x2xf32> + %19 = arith.addf %extracted, %cst_0 : f32 + %extracted_13 = tensor.extract %expanded_7[%arg4, %c1] : tensor<64x2xf32> + %20 = arith.addf %extracted_13, %19 : f32 + %extracted_slice_14 = tensor.extract_slice %arg5[%arg4] [1] [1] : tensor<64xf32> to tensor + %inserted = tensor.insert %20 into %extracted_slice_14[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg4] [1] [1] : tensor into tensor<64xf32> + } + } {mapping = [#gpu.thread]} + %expanded_8 = tensor.expand_shape %7 [[0, 1]] : tensor<64xf32> into tensor<32x2xf32> + %8 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<32xf32> + %9 = scf.forall (%arg4) in (32) shared_outs(%arg5 = %8) -> (tensor<32xf32>) { + %extracted = tensor.extract %expanded_8[%arg4, %c0] : tensor<32x2xf32> + %19 = arith.addf %extracted, %cst_0 : f32 + %extracted_13 = tensor.extract %expanded_8[%arg4, %c1] : tensor<32x2xf32> + %20 = arith.addf %extracted_13, %19 : f32 + %extracted_slice_14 = tensor.extract_slice %arg5[%arg4] [1] [1] : tensor<32xf32> to tensor + %inserted = tensor.insert %20 into 
%extracted_slice_14[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg4] [1] [1] : tensor into tensor<32xf32> + } + } {mapping = [#gpu.thread]} + %expanded_9 = tensor.expand_shape %9 [[0, 1]] : tensor<32xf32> into tensor<16x2xf32> + %10 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<16xf32> + %11 = scf.forall (%arg4) in (16) shared_outs(%arg5 = %10) -> (tensor<16xf32>) { + %extracted = tensor.extract %expanded_9[%arg4, %c0] : tensor<16x2xf32> + %19 = arith.addf %extracted, %cst_0 : f32 + %extracted_13 = tensor.extract %expanded_9[%arg4, %c1] : tensor<16x2xf32> + %20 = arith.addf %extracted_13, %19 : f32 + %extracted_slice_14 = tensor.extract_slice %arg5[%arg4] [1] [1] : tensor<16xf32> to tensor + %inserted = tensor.insert %20 into %extracted_slice_14[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg4] [1] [1] : tensor into tensor<16xf32> + } + } {mapping = [#gpu.thread]} + %expanded_10 = tensor.expand_shape %11 [[0, 1]] : tensor<16xf32> into tensor<8x2xf32> + %12 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<8xf32> + %13 = scf.forall (%arg4) in (8) shared_outs(%arg5 = %12) -> (tensor<8xf32>) { + %extracted = tensor.extract %expanded_10[%arg4, %c0] : tensor<8x2xf32> + %19 = arith.addf %extracted, %cst_0 : f32 + %extracted_13 = tensor.extract %expanded_10[%arg4, %c1] : tensor<8x2xf32> + %20 = arith.addf %extracted_13, %19 : f32 + %extracted_slice_14 = tensor.extract_slice %arg5[%arg4] [1] [1] : tensor<8xf32> to tensor + %inserted = tensor.insert %20 into %extracted_slice_14[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg4] [1] [1] : tensor into tensor<8xf32> + } + } {mapping = [#gpu.thread]} + %expanded_11 = tensor.expand_shape %13 [[0, 1]] : tensor<8xf32> into tensor<4x2xf32> + %14 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<4xf32> + %15 = scf.forall (%arg4) in (4) shared_outs(%arg5 = %14) -> (tensor<4xf32>) { + %extracted = tensor.extract %expanded_11[%arg4, %c0] : tensor<4x2xf32> + %19 = arith.addf %extracted, %cst_0 : f32 + %extracted_13 = tensor.extract %expanded_11[%arg4, %c1] : tensor<4x2xf32> + %20 = arith.addf %extracted_13, %19 : f32 + %extracted_slice_14 = tensor.extract_slice %arg5[%arg4] [1] [1] : tensor<4xf32> to tensor + %inserted = tensor.insert %20 into %extracted_slice_14[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg4] [1] [1] : tensor into tensor<4xf32> + } + } {mapping = [#gpu.thread]} + %expanded_12 = tensor.expand_shape %15 [[0, 1]] : tensor<4xf32> into tensor<2x2xf32> + %16 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<2xf32> + %17 = scf.forall (%arg4) in (2) shared_outs(%arg5 = %16) -> (tensor<2xf32>) { + %extracted = tensor.extract %expanded_12[%arg4, %c0] : tensor<2x2xf32> + %19 = arith.addf %extracted, %cst_0 : f32 + %extracted_13 = tensor.extract %expanded_12[%arg4, %c1] : tensor<2x2xf32> + %20 = arith.addf %extracted_13, %19 : f32 + %extracted_slice_14 = tensor.extract_slice %arg5[%arg4] [1] [1] : tensor<2xf32> to tensor + %inserted = tensor.insert %20 into %extracted_slice_14[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg4] [1] [1] : tensor into tensor<2xf32> + } + } {mapping = [#gpu.thread]} + %18 = scf.forall (%arg4) in (1) shared_outs(%arg5 = %extracted_slice_6) -> (tensor) { + %19 = affine.apply #map4(%arg4) + 
%extracted = tensor.extract %17[%19] : tensor<2xf32> + %20 = arith.addf %extracted, %cst_0 : f32 + %21 = affine.apply #map5(%arg4) + %extracted_13 = tensor.extract %17[%21] : tensor<2xf32> + %22 = arith.addf %extracted_13, %20 : f32 + %inserted = tensor.insert %22 into %arg5[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[] [] [] : tensor into tensor + } + } {mapping = [#gpu.thread]} + scf.forall.in_parallel { + tensor.parallel_insert_slice %18 into %arg3[%arg2] [1] [1] : tensor into tensor<32xf32> + } + } {mapping = [#gpu.block]} + %3 = scf.forall (%arg2) in (1) shared_outs(%arg3 = %0) -> (tensor) { + %4 = affine.apply #map10(%arg2) + %extracted_slice = tensor.extract_slice %2[%4] [32] [1] : tensor<32xf32> to tensor<32xf32> + %expanded_3 = tensor.expand_shape %extracted_slice [[0, 1]] : tensor<32xf32> into tensor<32x1xf32> + %5 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<32xf32> + %6 = scf.forall (%arg4) in (32) shared_outs(%arg5 = %5) -> (tensor<32xf32>) { + %extracted = tensor.extract %expanded_3[%arg4, %c0] : tensor<32x1xf32> + %16 = arith.addf %extracted, %cst_0 : f32 + %extracted_slice_8 = tensor.extract_slice %arg5[%arg4] [1] [1] : tensor<32xf32> to tensor + %inserted = tensor.insert %16 into %extracted_slice_8[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg4] [1] [1] : tensor into tensor<32xf32> + } + } {mapping = [#gpu.thread]} + %expanded_4 = tensor.expand_shape %6 [[0, 1]] : tensor<32xf32> into tensor<16x2xf32> + %7 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<16xf32> + %8 = scf.forall (%arg4) in (16) shared_outs(%arg5 = %7) -> (tensor<16xf32>) { + %extracted = tensor.extract %expanded_4[%arg4, %c0] : tensor<16x2xf32> + %16 = arith.addf %extracted, %cst_0 : f32 + %extracted_8 = tensor.extract %expanded_4[%arg4, %c1] : tensor<16x2xf32> + %17 = arith.addf %extracted_8, %16 : f32 + %extracted_slice_9 = tensor.extract_slice %arg5[%arg4] [1] [1] : tensor<16xf32> to tensor + %inserted = tensor.insert %17 into %extracted_slice_9[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg4] [1] [1] : tensor into tensor<16xf32> + } + } {mapping = [#gpu.thread]} + %expanded_5 = tensor.expand_shape %8 [[0, 1]] : tensor<16xf32> into tensor<8x2xf32> + %9 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<8xf32> + %10 = scf.forall (%arg4) in (8) shared_outs(%arg5 = %9) -> (tensor<8xf32>) { + %extracted = tensor.extract %expanded_5[%arg4, %c0] : tensor<8x2xf32> + %16 = arith.addf %extracted, %cst_0 : f32 + %extracted_8 = tensor.extract %expanded_5[%arg4, %c1] : tensor<8x2xf32> + %17 = arith.addf %extracted_8, %16 : f32 + %extracted_slice_9 = tensor.extract_slice %arg5[%arg4] [1] [1] : tensor<8xf32> to tensor + %inserted = tensor.insert %17 into %extracted_slice_9[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg4] [1] [1] : tensor into tensor<8xf32> + } + } {mapping = [#gpu.thread]} + %expanded_6 = tensor.expand_shape %10 [[0, 1]] : tensor<8xf32> into tensor<4x2xf32> + %11 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<4xf32> + %12 = scf.forall (%arg4) in (4) shared_outs(%arg5 = %11) -> (tensor<4xf32>) { + %extracted = tensor.extract %expanded_6[%arg4, %c0] : tensor<4x2xf32> + %16 = arith.addf %extracted, %cst_0 : f32 + %extracted_8 = tensor.extract %expanded_6[%arg4, %c1] : tensor<4x2xf32> + %17 = arith.addf 
%extracted_8, %16 : f32 + %extracted_slice_9 = tensor.extract_slice %arg5[%arg4] [1] [1] : tensor<4xf32> to tensor + %inserted = tensor.insert %17 into %extracted_slice_9[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg4] [1] [1] : tensor into tensor<4xf32> + } + } {mapping = [#gpu.thread]} + %expanded_7 = tensor.expand_shape %12 [[0, 1]] : tensor<4xf32> into tensor<2x2xf32> + %13 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<2xf32> + %14 = scf.forall (%arg4) in (2) shared_outs(%arg5 = %13) -> (tensor<2xf32>) { + %extracted = tensor.extract %expanded_7[%arg4, %c0] : tensor<2x2xf32> + %16 = arith.addf %extracted, %cst_0 : f32 + %extracted_8 = tensor.extract %expanded_7[%arg4, %c1] : tensor<2x2xf32> + %17 = arith.addf %extracted_8, %16 : f32 + %extracted_slice_9 = tensor.extract_slice %arg5[%arg4] [1] [1] : tensor<2xf32> to tensor + %inserted = tensor.insert %17 into %extracted_slice_9[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg4] [1] [1] : tensor into tensor<2xf32> + } + } {mapping = [#gpu.thread]} + %15 = scf.forall (%arg4) in (1) shared_outs(%arg5 = %arg3) -> (tensor) { + %16 = affine.apply #map4(%arg4) + %extracted = tensor.extract %14[%16] : tensor<2xf32> + %17 = arith.addf %extracted, %cst_0 : f32 + %18 = affine.apply #map5(%arg4) + %extracted_8 = tensor.extract %14[%18] : tensor<2xf32> + %19 = arith.addf %extracted_8, %17 : f32 + %inserted = tensor.insert %19 into %arg5[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[] [] [] : tensor into tensor + } + } {mapping = [#gpu.thread]} + scf.forall.in_parallel { + tensor.parallel_insert_slice %15 into %arg3[] [] [] : tensor into tensor + } + } {mapping = [#gpu.block]} + return %3 : tensor + } + func.func private @Unknown148(%arg0: tensor) -> tensor attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 4.000000e+00 : f32 %0 = tensor.empty() : tensor - %1 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = []} ins(%arg0 : tensor) outs(%0 : tensor) { + %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%arg0 : tensor) outs(%0 : tensor) { ^bb0(%in: f32, %out: f32): %2 = arith.negf %in : f32 %3 = arith.divf %2, %cst : f32 @@ -677,203 +2019,400 @@ module @IrToMhlo.2452 { } -> tensor return %1 : tensor } - func.func private @Unknown142(%arg0: tensor<64x3x7x7xf16>) -> tensor<64x3x7x7xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown149(%arg0: tensor<64x3x7x7xf16>) -> tensor<64x3x7x7xf32> attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<64x3x7x7xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x3x7x7xf16>) outs(%0 : tensor<64x3x7x7xf32>) attrs = {xla_shape = "f32[64,3,7,7]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<64x3x7x7xf32> + %1 = scf.for %arg1 = %c0 to %c64 step %c1 iter_args(%arg2 = %0) -> (tensor<64x3x7x7xf32>) { + %2 = scf.for %arg3 = %c0 to %c3 step %c1 iter_args(%arg4 = %arg2) -> (tensor<64x3x7x7xf32>) { + %3 = scf.for %arg5 = %c0 to %c7 step %c1 iter_args(%arg6 = %arg4) -> (tensor<64x3x7x7xf32>) { + %4 = scf.for %arg7 = %c0 to %c7 step 
%c1 iter_args(%arg8 = %arg6) -> (tensor<64x3x7x7xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<64x3x7x7xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<64x3x7x7xf32> + scf.yield %inserted_slice : tensor<64x3x7x7xf32> + } + scf.yield %4 : tensor<64x3x7x7xf32> + } + scf.yield %3 : tensor<64x3x7x7xf32> + } + scf.yield %2 : tensor<64x3x7x7xf32> + } return %1 : tensor<64x3x7x7xf32> } - func.func private @Unknown143(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown150(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<64x64x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf16>) outs(%0 : tensor<64x64x3x3xf32>) attrs = {xla_shape = "f32[64,64,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<64x64x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c64 step %c1 iter_args(%arg2 = %0) -> (tensor<64x64x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %arg2) -> (tensor<64x64x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<64x64x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<64x64x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<64x64x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<64x64x3x3xf32> + scf.yield %inserted_slice : tensor<64x64x3x3xf32> + } + scf.yield %4 : tensor<64x64x3x3xf32> + } + scf.yield %3 : tensor<64x64x3x3xf32> + } + scf.yield %2 : tensor<64x64x3x3xf32> + } return %1 : tensor<64x64x3x3xf32> } - func.func private @Unknown144(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<64x64x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf16>) outs(%0 : tensor<64x64x3x3xf32>) attrs = {xla_shape = "f32[64,64,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<64x64x3x3xf32> - return %1 : tensor<64x64x3x3xf32> - } - func.func private @Unknown145(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<64x64x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, 
#map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf16>) outs(%0 : tensor<64x64x3x3xf32>) attrs = {xla_shape = "f32[64,64,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<64x64x3x3xf32> - return %1 : tensor<64x64x3x3xf32> - } - func.func private @Unknown146(%arg0: tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<64x64x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<64x64x3x3xf16>) outs(%0 : tensor<64x64x3x3xf32>) attrs = {xla_shape = "f32[64,64,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<64x64x3x3xf32> - return %1 : tensor<64x64x3x3xf32> - } - func.func private @Unknown147(%arg0: tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown154(%arg0: tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<128x64x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x64x3x3xf16>) outs(%0 : tensor<128x64x3x3xf32>) attrs = {xla_shape = "f32[128,64,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<128x64x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<128x64x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %arg2) -> (tensor<128x64x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<128x64x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<128x64x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<128x64x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<128x64x3x3xf32> + scf.yield %inserted_slice : tensor<128x64x3x3xf32> + } + scf.yield %4 : tensor<128x64x3x3xf32> + } + scf.yield %3 : tensor<128x64x3x3xf32> + } + scf.yield %2 : tensor<128x64x3x3xf32> + } return %1 : tensor<128x64x3x3xf32> } - func.func private @Unknown148(%arg0: tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown155(%arg0: tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<128x128x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x128x3x3xf16>) outs(%0 : 
tensor<128x128x3x3xf32>) attrs = {xla_shape = "f32[128,128,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<128x128x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<128x128x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %arg2) -> (tensor<128x128x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<128x128x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<128x128x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<128x128x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<128x128x3x3xf32> + scf.yield %inserted_slice : tensor<128x128x3x3xf32> + } + scf.yield %4 : tensor<128x128x3x3xf32> + } + scf.yield %3 : tensor<128x128x3x3xf32> + } + scf.yield %2 : tensor<128x128x3x3xf32> + } return %1 : tensor<128x128x3x3xf32> } - func.func private @Unknown149(%arg0: tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown156(%arg0: tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<128x64x1x1xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x64x1x1xf16>) outs(%0 : tensor<128x64x1x1xf32>) attrs = {xla_shape = "f32[128,64,1,1]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<128x64x1x1xf32> + %1 = scf.for %arg1 = %c0 to %c128 step %c1 iter_args(%arg2 = %0) -> (tensor<128x64x1x1xf32>) { + %2 = scf.for %arg3 = %c0 to %c64 step %c1 iter_args(%arg4 = %arg2) -> (tensor<128x64x1x1xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<128x64x1x1xf16> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f16, %out: f32): + %5 = arith.extf %in : f16 to f32 + linalg.yield %5 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<128x64x1x1xf32> + scf.yield %inserted_slice : tensor<128x64x1x1xf32> + } + scf.yield %2 : tensor<128x64x1x1xf32> + } return %1 : tensor<128x64x1x1xf32> } - func.func private @Unknown150(%arg0: tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<128x128x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x128x3x3xf16>) outs(%0 : tensor<128x128x3x3xf32>) attrs = {xla_shape = "f32[128,128,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } 
-> tensor<128x128x3x3xf32> - return %1 : tensor<128x128x3x3xf32> - } - func.func private @Unknown151(%arg0: tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<128x128x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<128x128x3x3xf16>) outs(%0 : tensor<128x128x3x3xf32>) attrs = {xla_shape = "f32[128,128,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<128x128x3x3xf32> - return %1 : tensor<128x128x3x3xf32> - } - func.func private @Unknown152(%arg0: tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown159(%arg0: tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<256x128x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x128x3x3xf16>) outs(%0 : tensor<256x128x3x3xf32>) attrs = {xla_shape = "f32[256,128,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<256x128x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 iter_args(%arg2 = %0) -> (tensor<256x128x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %arg2) -> (tensor<256x128x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<256x128x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<256x128x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<256x128x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<256x128x3x3xf32> + scf.yield %inserted_slice : tensor<256x128x3x3xf32> + } + scf.yield %4 : tensor<256x128x3x3xf32> + } + scf.yield %3 : tensor<256x128x3x3xf32> + } + scf.yield %2 : tensor<256x128x3x3xf32> + } return %1 : tensor<256x128x3x3xf32> } - func.func private @Unknown153(%arg0: tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown160(%arg0: tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<256x256x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x256x3x3xf16>) outs(%0 : tensor<256x256x3x3xf32>) attrs = {xla_shape = "f32[256,256,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<256x256x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 
iter_args(%arg2 = %0) -> (tensor<256x256x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %arg2) -> (tensor<256x256x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<256x256x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<256x256x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<256x256x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<256x256x3x3xf32> + scf.yield %inserted_slice : tensor<256x256x3x3xf32> + } + scf.yield %4 : tensor<256x256x3x3xf32> + } + scf.yield %3 : tensor<256x256x3x3xf32> + } + scf.yield %2 : tensor<256x256x3x3xf32> + } return %1 : tensor<256x256x3x3xf32> } - func.func private @Unknown154(%arg0: tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown161(%arg0: tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<256x128x1x1xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x128x1x1xf16>) outs(%0 : tensor<256x128x1x1xf32>) attrs = {xla_shape = "f32[256,128,1,1]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<256x128x1x1xf32> + %1 = scf.for %arg1 = %c0 to %c256 step %c1 iter_args(%arg2 = %0) -> (tensor<256x128x1x1xf32>) { + %2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %arg2) -> (tensor<256x128x1x1xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<256x128x1x1xf16> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f16, %out: f32): + %5 = arith.extf %in : f16 to f32 + linalg.yield %5 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<256x128x1x1xf32> + scf.yield %inserted_slice : tensor<256x128x1x1xf32> + } + scf.yield %2 : tensor<256x128x1x1xf32> + } return %1 : tensor<256x128x1x1xf32> } - func.func private @Unknown155(%arg0: tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<256x256x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x256x3x3xf16>) outs(%0 : tensor<256x256x3x3xf32>) attrs = {xla_shape = "f32[256,256,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<256x256x3x3xf32> - return %1 : tensor<256x256x3x3xf32> - } - func.func private @Unknown156(%arg0: tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = 
tensor.empty() : tensor<256x256x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<256x256x3x3xf16>) outs(%0 : tensor<256x256x3x3xf32>) attrs = {xla_shape = "f32[256,256,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<256x256x3x3xf32> - return %1 : tensor<256x256x3x3xf32> - } - func.func private @Unknown157(%arg0: tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown164(%arg0: tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<512x256x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x256x3x3xf16>) outs(%0 : tensor<512x256x3x3xf32>) attrs = {xla_shape = "f32[512,256,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<512x256x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<512x256x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %arg2) -> (tensor<512x256x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) -> (tensor<512x256x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<512x256x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<512x256x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<512x256x3x3xf32> + scf.yield %inserted_slice : tensor<512x256x3x3xf32> + } + scf.yield %4 : tensor<512x256x3x3xf32> + } + scf.yield %3 : tensor<512x256x3x3xf32> + } + scf.yield %2 : tensor<512x256x3x3xf32> + } return %1 : tensor<512x256x3x3xf32> } - func.func private @Unknown158(%arg0: tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown165(%arg0: tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<512x512x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x512x3x3xf16>) outs(%0 : tensor<512x512x3x3xf32>) attrs = {xla_shape = "f32[512,512,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<512x512x3x3xf32> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<512x512x3x3xf32>) { + %2 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %arg2) -> (tensor<512x512x3x3xf32>) { + %3 = scf.for %arg5 = %c0 to %c3 step %c1 iter_args(%arg6 = %arg4) 
-> (tensor<512x512x3x3xf32>) { + %4 = scf.for %arg7 = %c0 to %c3 step %c1 iter_args(%arg8 = %arg6) -> (tensor<512x512x3x3xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<512x512x3x3xf16> to tensor + %5 = tensor.empty() : tensor + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%5 : tensor) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %6 into %arg8[%arg1, %arg3, %arg5, %arg7] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<512x512x3x3xf32> + scf.yield %inserted_slice : tensor<512x512x3x3xf32> + } + scf.yield %4 : tensor<512x512x3x3xf32> + } + scf.yield %3 : tensor<512x512x3x3xf32> + } + scf.yield %2 : tensor<512x512x3x3xf32> + } return %1 : tensor<512x512x3x3xf32> } - func.func private @Unknown159(%arg0: tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown166(%arg0: tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<512x256x1x1xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x256x1x1xf16>) outs(%0 : tensor<512x256x1x1xf32>) attrs = {xla_shape = "f32[512,256,1,1]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<512x256x1x1xf32> + %1 = scf.for %arg1 = %c0 to %c512 step %c1 iter_args(%arg2 = %0) -> (tensor<512x256x1x1xf32>) { + %2 = scf.for %arg3 = %c0 to %c256 step %c1 iter_args(%arg4 = %arg2) -> (tensor<512x256x1x1xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<512x256x1x1xf16> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f16, %out: f32): + %5 = arith.extf %in : f16 to f32 + linalg.yield %5 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : tensor into tensor<512x256x1x1xf32> + scf.yield %inserted_slice : tensor<512x256x1x1xf32> + } + scf.yield %2 : tensor<512x256x1x1xf32> + } return %1 : tensor<512x256x1x1xf32> } - func.func private @Unknown160(%arg0: tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<512x512x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x512x3x3xf16>) outs(%0 : tensor<512x512x3x3xf32>) attrs = {xla_shape = "f32[512,512,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<512x512x3x3xf32> - return %1 : tensor<512x512x3x3xf32> - } - func.func private @Unknown161(%arg0: tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %0 = tensor.empty() : tensor<512x512x3x3xf32> - %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x512x3x3xf16>) outs(%0 : tensor<512x512x3x3xf32>) 
attrs = {xla_shape = "f32[512,512,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<512x512x3x3xf32> - return %1 : tensor<512x512x3x3xf32> - } - func.func private @Unknown163(%arg0: tensor<1000x512xf16>) -> tensor<1000x512xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown170(%arg0: tensor<1000x512xf16>) -> tensor<1000x512xf32> attributes {__byteir_elementwise_fusion__} { + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index + %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<1000x512xf32> - %1 = linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<1000x512xf16>) outs(%0 : tensor<1000x512xf32>) attrs = {xla_shape = "f32[1000,512]{0,1}"} { - ^bb0(%in: f16, %out: f32): - %2 = arith.extf %in : f16 to f32 - linalg.yield %2 : f32 - } -> tensor<1000x512xf32> + %1 = scf.for %arg1 = %c0 to %c1000 step %c1 iter_args(%arg2 = %0) -> (tensor<1000x512xf32>) { + %2 = scf.for %arg3 = %c0 to %c512 step %c1 iter_args(%arg4 = %arg2) -> (tensor<1000x512xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1, %arg3] [1, 1] [1, 1] : tensor<1000x512xf16> to tensor + %3 = tensor.empty() : tensor + %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%3 : tensor) { + ^bb0(%in: f16, %out: f32): + %5 = arith.extf %in : f16 to f32 + linalg.yield %5 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %4 into %arg4[%arg1, %arg3] [1, 1] [1, 1] : tensor into tensor<1000x512xf32> + scf.yield %inserted_slice : tensor<1000x512xf32> + } + scf.yield %2 : tensor<1000x512xf32> + } return %1 : tensor<1000x512xf32> } - func.func private @Unknown164(%arg0: tensor<1000xf32>) -> tensor<1000xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown171(%arg0: tensor<4x1000xf16>) -> tensor<1000xf32> attributes {__byteir_reduction_fusion__} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 0.000000e+00 : f16 + %cst_0 = arith.constant 0.000000e+00 : f32 %0 = tensor.empty() : tensor<1000xf32> - %1 = linalg.generic {indexing_maps = [#map6, #map6], iterator_types = ["parallel"]} ins(%arg0 : tensor<1000xf32>) outs(%0 : tensor<1000xf32>) { - ^bb0(%in: f32, %out: f32): - %2 = arith.truncf %in : f32 to f16 - %3 = arith.extf %2 : f16 to f32 - linalg.yield %3 : f32 - } -> tensor<1000xf32> + %1 = scf.forall (%arg1) in (32) shared_outs(%arg2 = %0) -> (tensor<1000xf32>) { + %2 = affine.min #map11(%arg1) + %3 = affine.apply #map10(%arg1) + %4 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<32xf32> + %5 = bufferization.alloc_tensor() {memory_space = #gpu.address_space} : tensor<2x32xf32> + %6 = scf.forall (%arg3, %arg4) in (2, 32) shared_outs(%arg5 = %5) -> (tensor<2x32xf32>) { + %8 = affine.min #map12(%arg4, %arg1) + %9 = affine.min #map13(%arg4, %arg1) + %10 = affine.apply #map3(%9, %8) + %11 = arith.cmpi ugt, %10, %c0 : index + %12 = scf.if %11 -> (f16) { + %19 = affine.apply #map4(%arg3) + %20 = affine.apply #map14(%arg1)[%8] + %extracted = tensor.extract %arg0[%19, %20] : tensor<4x1000xf16> + scf.yield %extracted : f16 + } else { + scf.yield %cst : f16 + } + %13 = arith.extf %12 : f16 to f32 + %14 = arith.addf %13, %cst_0 : f32 + %15 = arith.cmpi ugt, %10, %c0 : index + %16 = scf.if %15 -> (f16) { + %19 = affine.apply #map5(%arg3) + %20 = affine.apply #map14(%arg1)[%8] + 
%extracted = tensor.extract %arg0[%19, %20] : tensor<4x1000xf16> + scf.yield %extracted : f16 + } else { + scf.yield %cst : f16 + } + %17 = arith.extf %16 : f16 to f32 + %18 = arith.addf %14, %17 : f32 + %extracted_slice_1 = tensor.extract_slice %arg5[%arg3, %arg4] [1, 1] [1, 1] : tensor<2x32xf32> to tensor + %inserted = tensor.insert %18 into %extracted_slice_1[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg5[%arg3, %arg4] [1, 1] [1, 1] : tensor into tensor<2x32xf32> + } + } {mapping = [#gpu.thread, #gpu.thread]} + %7 = scf.forall (%arg3) in (32) shared_outs(%arg4 = %4) -> (tensor<32xf32>) { + %extracted = tensor.extract %6[%c0, %arg3] : tensor<2x32xf32> + %8 = arith.addf %extracted, %cst_0 : f32 + %extracted_1 = tensor.extract %6[%c1, %arg3] : tensor<2x32xf32> + %9 = arith.addf %extracted_1, %8 : f32 + %extracted_slice_2 = tensor.extract_slice %arg4[%arg3] [1] [1] : tensor<32xf32> to tensor + %inserted = tensor.insert %9 into %extracted_slice_2[] : tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %inserted into %arg4[%arg3] [1] [1] : tensor into tensor<32xf32> + } + } {mapping = [#gpu.thread]} + %extracted_slice = tensor.extract_slice %7[0] [%2] [1] : tensor<32xf32> to tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %extracted_slice into %arg2[%3] [%2] [1] : tensor into tensor<1000xf32> + } + } {mapping = [#gpu.block]} + return %1 : tensor<1000xf32> + } + func.func private @Unknown172(%arg0: tensor<1000xf32>) -> tensor<1000xf32> attributes {__byteir_elementwise_fusion__} { + %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index + %c0 = arith.constant 0 : index + %0 = tensor.empty() : tensor<1000xf32> + %1 = scf.for %arg1 = %c0 to %c1000 step %c1 iter_args(%arg2 = %0) -> (tensor<1000xf32>) { + %extracted_slice = tensor.extract_slice %arg0[%arg1] [1] [1] : tensor<1000xf32> to tensor + %2 = tensor.empty() : tensor + %3 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%extracted_slice : tensor) outs(%2 : tensor) { + ^bb0(%in: f32, %out: f32): + %4 = arith.truncf %in : f32 to f16 + %5 = arith.extf %4 : f16 to f32 + linalg.yield %5 : f32 + } -> tensor + %inserted_slice = tensor.insert_slice %3 into %arg2[%arg1] [1] [1] : tensor into tensor<1000xf32> + scf.yield %inserted_slice : tensor<1000xf32> + } return %1 : tensor<1000xf32> } func.func @main(%arg0: tensor<4x3x224x224xf32>, %arg1: tensor<4x1000xf32>, %arg2: tensor<64x3x7x7xf32>, %arg3: tensor<64xf32>, %arg4: tensor<64xf32>, %arg5: tensor<64xf32>, %arg6: tensor<64xf32>, %arg7: tensor<64x64x3x3xf32>, %arg8: tensor<64xf32>, %arg9: tensor<64xf32>, %arg10: tensor<64xf32>, %arg11: tensor<64xf32>, %arg12: tensor<64x64x3x3xf32>, %arg13: tensor<64xf32>, %arg14: tensor<64xf32>, %arg15: tensor<64xf32>, %arg16: tensor<64xf32>, %arg17: tensor<64x64x3x3xf32>, %arg18: tensor<64xf32>, %arg19: tensor<64xf32>, %arg20: tensor<64xf32>, %arg21: tensor<64xf32>, %arg22: tensor<64x64x3x3xf32>, %arg23: tensor<64xf32>, %arg24: tensor<64xf32>, %arg25: tensor<64xf32>, %arg26: tensor<64xf32>, %arg27: tensor<128x64x3x3xf32>, %arg28: tensor<128xf32>, %arg29: tensor<128xf32>, %arg30: tensor<128xf32>, %arg31: tensor<128xf32>, %arg32: tensor<128x128x3x3xf32>, %arg33: tensor<128xf32>, %arg34: tensor<128xf32>, %arg35: tensor<128xf32>, %arg36: tensor<128xf32>, %arg37: tensor<128x64x1x1xf32>, %arg38: tensor<128xf32>, %arg39: tensor<128xf32>, %arg40: tensor<128xf32>, %arg41: tensor<128xf32>, %arg42: tensor<128x128x3x3xf32>, %arg43: tensor<128xf32>, %arg44: 
tensor<128xf32>, %arg45: tensor<128xf32>, %arg46: tensor<128xf32>, %arg47: tensor<128x128x3x3xf32>, %arg48: tensor<128xf32>, %arg49: tensor<128xf32>, %arg50: tensor<128xf32>, %arg51: tensor<128xf32>, %arg52: tensor<256x128x3x3xf32>, %arg53: tensor<256xf32>, %arg54: tensor<256xf32>, %arg55: tensor<256xf32>, %arg56: tensor<256xf32>, %arg57: tensor<256x256x3x3xf32>, %arg58: tensor<256xf32>, %arg59: tensor<256xf32>, %arg60: tensor<256xf32>, %arg61: tensor<256xf32>, %arg62: tensor<256x128x1x1xf32>, %arg63: tensor<256xf32>, %arg64: tensor<256xf32>, %arg65: tensor<256xf32>, %arg66: tensor<256xf32>, %arg67: tensor<256x256x3x3xf32>, %arg68: tensor<256xf32>, %arg69: tensor<256xf32>, %arg70: tensor<256xf32>, %arg71: tensor<256xf32>, %arg72: tensor<256x256x3x3xf32>, %arg73: tensor<256xf32>, %arg74: tensor<256xf32>, %arg75: tensor<256xf32>, %arg76: tensor<256xf32>, %arg77: tensor<512x256x3x3xf32>, %arg78: tensor<512xf32>, %arg79: tensor<512xf32>, %arg80: tensor<512xf32>, %arg81: tensor<512xf32>, %arg82: tensor<512x512x3x3xf32>, %arg83: tensor<512xf32>, %arg84: tensor<512xf32>, %arg85: tensor<512xf32>, %arg86: tensor<512xf32>, %arg87: tensor<512x256x1x1xf32>, %arg88: tensor<512xf32>, %arg89: tensor<512xf32>, %arg90: tensor<512xf32>, %arg91: tensor<512xf32>, %arg92: tensor<512x512x3x3xf32>, %arg93: tensor<512xf32>, %arg94: tensor<512xf32>, %arg95: tensor<512xf32>, %arg96: tensor<512xf32>, %arg97: tensor<512x512x3x3xf32>, %arg98: tensor<512xf32>, %arg99: tensor<512xf32>, %arg100: tensor<512xf32>, %arg101: tensor<512xf32>, %arg102: tensor<1000x512xf32>, %arg103: tensor<1000xf32>) -> (tensor, tensor<64x3x7x7xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128x64x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x64x1x1xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256x128x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x128x1x1xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512x256x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x256x1x1xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1000x512xf32>, tensor<1000xf32>) attributes {__placeholder__byre.entry_point} { @@ -884,51 +2423,51 @@ module @IrToMhlo.2452 { %4 = tensor.empty() : tensor<4x64x112x112xf16> %5 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%3, %arg3, %arg4 : tensor<4x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) outs(%4 : tensor<4x64x112x112xf16>) : tensor<4x64x112x112xf16> %6 = call @Unknown3(%arg7) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> - %7 = call @Unknown4(%arg12) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> - %8 = call @Unknown5(%arg17) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> - %9 = call @Unknown6(%arg22) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> + %7 = call @Unknown3(%arg12) 
: (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> + %8 = call @Unknown3(%arg17) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> + %9 = call @Unknown3(%arg22) : (tensor<64x64x3x3xf32>) -> tensor<64x64x3x3xf16> %10 = call @Unknown7(%arg37) : (tensor<128x64x1x1xf32>) -> tensor<128x64x1x1xf16> %11 = call @Unknown8(%arg27) : (tensor<128x64x3x3xf32>) -> tensor<128x64x3x3xf16> %12 = call @Unknown9(%arg32) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> - %13 = call @Unknown10(%arg42) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> - %14 = call @Unknown11(%arg47) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> + %13 = call @Unknown9(%arg42) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> + %14 = call @Unknown9(%arg47) : (tensor<128x128x3x3xf32>) -> tensor<128x128x3x3xf16> %15 = call @Unknown12(%arg62) : (tensor<256x128x1x1xf32>) -> tensor<256x128x1x1xf16> %16 = call @Unknown13(%arg52) : (tensor<256x128x3x3xf32>) -> tensor<256x128x3x3xf16> %17 = call @Unknown14(%arg57) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> - %18 = call @Unknown15(%arg67) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> - %19 = call @Unknown16(%arg72) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> + %18 = call @Unknown14(%arg67) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> + %19 = call @Unknown14(%arg72) : (tensor<256x256x3x3xf32>) -> tensor<256x256x3x3xf16> %20 = call @Unknown17(%arg87) : (tensor<512x256x1x1xf32>) -> tensor<512x256x1x1xf16> %21 = call @Unknown18(%arg77) : (tensor<512x256x3x3xf32>) -> tensor<512x256x3x3xf16> %22 = call @Unknown19(%arg82) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> - %23 = call @Unknown20(%arg92) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> - %24 = call @Unknown21(%arg97) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> + %23 = call @Unknown19(%arg92) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> + %24 = call @Unknown19(%arg97) : (tensor<512x512x3x3xf32>) -> tensor<512x512x3x3xf16> %25 = call @Unknown22(%arg1) : (tensor<4x1000xf32>) -> tensor<4x1000xf16> %26 = call @Unknown23(%arg102) : (tensor<1000x512xf32>) -> tensor<1000x512xf16> - %27 = tensor.empty() : tensor<4xf16> - %28 = byre.compute_on_tensor @ReduceSumOp_f16_f16 {dimensions = dense<1> : tensor<1xi64>} ins(%25 : tensor<4x1000xf16>) outs(%27 : tensor<4xf16>) : tensor<4xf16> - %29:2 = call @Unknown24(%5) : (tensor<4x64x112x112xf16>) -> (tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1>) + %27 = call @Unknown24(%arg103) : (tensor<1000xf32>) -> tensor<1000xf16> + %28 = call @Unknown25(%25) : (tensor<4x1000xf16>) -> tensor<4xf16> + %29:2 = call @Unknown26(%5) : (tensor<4x64x112x112xf16>) -> (tensor<4x64x112x112xf16>, tensor<4x64x112x112xi1>) %30 = tensor.empty() : tensor<4x64x56x56xf16> %31 = byre.compute_on_tensor @PoolMaxOp_f16_f16 {base_dilations = dense<1> : tensor<4xi64>, padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = dense<1> : tensor<4xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} ins(%29#0 : tensor<4x64x112x112xf16>) outs(%30 : tensor<4x64x56x56xf16>) : tensor<4x64x56x56xf16> %32 = tensor.empty() : tensor<4x64x56x56xf16> %33 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, 
window_strides = dense<1> : tensor<2xi64>} ins(%31, %6 : tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) outs(%32 : tensor<4x64x56x56xf16>) : tensor<4x64x56x56xf16> %34 = tensor.empty() : tensor<4x64x56x56xf16> %35 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%33, %arg8, %arg9 : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) outs(%34 : tensor<4x64x56x56xf16>) : tensor<4x64x56x56xf16> - %36:2 = call @Unknown26(%35) : (tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) + %36:2 = call @Unknown28(%35) : (tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) %37 = tensor.empty() : tensor<4x64x56x56xf16> %38 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%36#0, %7 : tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) outs(%37 : tensor<4x64x56x56xf16>) : tensor<4x64x56x56xf16> %39 = tensor.empty() : tensor<4x64x56x56xf16> %40 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%38, %arg13, %arg14 : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) outs(%39 : tensor<4x64x56x56xf16>) : tensor<4x64x56x56xf16> - %41:2 = call @Unknown28(%40, %31) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) + %41:2 = call @Unknown30(%40, %31) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) %42 = tensor.empty() : tensor<4x64x56x56xf16> %43 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%41#0, %8 : tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) outs(%42 : tensor<4x64x56x56xf16>) : tensor<4x64x56x56xf16> %44 = tensor.empty() : tensor<4x64x56x56xf16> %45 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%43, %arg18, %arg19 : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) outs(%44 : tensor<4x64x56x56xf16>) : tensor<4x64x56x56xf16> - %46:2 = call @Unknown30(%45) : (tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) + %46:2 = call @Unknown28(%45) : (tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) %47 = tensor.empty() : tensor<4x64x56x56xf16> %48 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%46#0, %9 : tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) outs(%47 : tensor<4x64x56x56xf16>) : tensor<4x64x56x56xf16> %49 = tensor.empty() : tensor<4x64x56x56xf16> %50 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%48, %arg23, %arg24 : tensor<4x64x56x56xf16>, 
tensor<64xf32>, tensor<64xf32>) outs(%49 : tensor<4x64x56x56xf16>) : tensor<4x64x56x56xf16> - %51:2 = call @Unknown32(%50, %41#0) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) + %51:2 = call @Unknown30(%50, %41#0) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> (tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) %52 = tensor.empty() : tensor<4x128x28x28xf16> %53 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%51#0, %10 : tensor<4x64x56x56xf16>, tensor<128x64x1x1xf16>) outs(%52 : tensor<4x128x28x28xf16>) : tensor<4x128x28x28xf16> %54 = tensor.empty() : tensor<4x128x28x28xf16> @@ -937,22 +2476,22 @@ module @IrToMhlo.2452 { %57 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%51#0, %11 : tensor<4x64x56x56xf16>, tensor<128x64x3x3xf16>) outs(%56 : tensor<4x128x28x28xf16>) : tensor<4x128x28x28xf16> %58 = tensor.empty() : tensor<4x128x28x28xf16> %59 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%57, %arg28, %arg29 : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) outs(%58 : tensor<4x128x28x28xf16>) : tensor<4x128x28x28xf16> - %60:2 = call @Unknown35(%59) : (tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) + %60:2 = call @Unknown37(%59) : (tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) %61 = tensor.empty() : tensor<4x128x28x28xf16> %62 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%60#0, %12 : tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) outs(%61 : tensor<4x128x28x28xf16>) : tensor<4x128x28x28xf16> %63 = tensor.empty() : tensor<4x128x28x28xf16> %64 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%62, %arg33, %arg34 : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) outs(%63 : tensor<4x128x28x28xf16>) : tensor<4x128x28x28xf16> - %65:2 = call @Unknown37(%64, %55) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) + %65:2 = call @Unknown39(%64, %55) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) %66 = tensor.empty() : tensor<4x128x28x28xf16> %67 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%65#0, %13 : tensor<4x128x28x28xf16>, 
tensor<128x128x3x3xf16>) outs(%66 : tensor<4x128x28x28xf16>) : tensor<4x128x28x28xf16> %68 = tensor.empty() : tensor<4x128x28x28xf16> %69 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%67, %arg43, %arg44 : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) outs(%68 : tensor<4x128x28x28xf16>) : tensor<4x128x28x28xf16> - %70:2 = call @Unknown39(%69) : (tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) + %70:2 = call @Unknown37(%69) : (tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) %71 = tensor.empty() : tensor<4x128x28x28xf16> %72 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%70#0, %14 : tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) outs(%71 : tensor<4x128x28x28xf16>) : tensor<4x128x28x28xf16> %73 = tensor.empty() : tensor<4x128x28x28xf16> %74 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%72, %arg48, %arg49 : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) outs(%73 : tensor<4x128x28x28xf16>) : tensor<4x128x28x28xf16> - %75:2 = call @Unknown41(%74, %65#0) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) + %75:2 = call @Unknown39(%74, %65#0) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) -> (tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) %76 = tensor.empty() : tensor<4x256x14x14xf16> %77 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%75#0, %15 : tensor<4x128x28x28xf16>, tensor<256x128x1x1xf16>) outs(%76 : tensor<4x256x14x14xf16>) : tensor<4x256x14x14xf16> %78 = tensor.empty() : tensor<4x256x14x14xf16> @@ -961,22 +2500,22 @@ module @IrToMhlo.2452 { %81 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%75#0, %16 : tensor<4x128x28x28xf16>, tensor<256x128x3x3xf16>) outs(%80 : tensor<4x256x14x14xf16>) : tensor<4x256x14x14xf16> %82 = tensor.empty() : tensor<4x256x14x14xf16> %83 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%81, %arg53, %arg54 : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) outs(%82 : tensor<4x256x14x14xf16>) : tensor<4x256x14x14xf16> - %84:2 = call @Unknown44(%83) : (tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) + %84:2 = call @Unknown46(%83) : (tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) %85 = tensor.empty() : tensor<4x256x14x14xf16> %86 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", 
kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%84#0, %17 : tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) outs(%85 : tensor<4x256x14x14xf16>) : tensor<4x256x14x14xf16> %87 = tensor.empty() : tensor<4x256x14x14xf16> %88 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%86, %arg58, %arg59 : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) outs(%87 : tensor<4x256x14x14xf16>) : tensor<4x256x14x14xf16> - %89:2 = call @Unknown46(%88, %79) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) + %89:2 = call @Unknown48(%88, %79) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) %90 = tensor.empty() : tensor<4x256x14x14xf16> %91 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%89#0, %18 : tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) outs(%90 : tensor<4x256x14x14xf16>) : tensor<4x256x14x14xf16> %92 = tensor.empty() : tensor<4x256x14x14xf16> %93 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%91, %arg68, %arg69 : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) outs(%92 : tensor<4x256x14x14xf16>) : tensor<4x256x14x14xf16> - %94:2 = call @Unknown48(%93) : (tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) + %94:2 = call @Unknown46(%93) : (tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) %95 = tensor.empty() : tensor<4x256x14x14xf16> %96 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%94#0, %19 : tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) outs(%95 : tensor<4x256x14x14xf16>) : tensor<4x256x14x14xf16> %97 = tensor.empty() : tensor<4x256x14x14xf16> %98 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%96, %arg73, %arg74 : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) outs(%97 : tensor<4x256x14x14xf16>) : tensor<4x256x14x14xf16> - %99:2 = call @Unknown50(%98, %89#0) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) + %99:2 = call @Unknown48(%98, %89#0) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) -> (tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) %100 = tensor.empty() : tensor<4x512x7x7xf16> %101 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%99#0, %20 : tensor<4x256x14x14xf16>, 
tensor<512x256x1x1xf16>) outs(%100 : tensor<4x512x7x7xf16>) : tensor<4x512x7x7xf16> %102 = tensor.empty() : tensor<4x512x7x7xf16> @@ -985,243 +2524,239 @@ module @IrToMhlo.2452 { %105 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%99#0, %21 : tensor<4x256x14x14xf16>, tensor<512x256x3x3xf16>) outs(%104 : tensor<4x512x7x7xf16>) : tensor<4x512x7x7xf16> %106 = tensor.empty() : tensor<4x512x7x7xf16> %107 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%105, %arg78, %arg79 : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) outs(%106 : tensor<4x512x7x7xf16>) : tensor<4x512x7x7xf16> - %108:2 = call @Unknown53(%107) : (tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) + %108:2 = call @Unknown55(%107) : (tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) %109 = tensor.empty() : tensor<4x512x7x7xf16> %110 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%108#0, %22 : tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) outs(%109 : tensor<4x512x7x7xf16>) : tensor<4x512x7x7xf16> %111 = tensor.empty() : tensor<4x512x7x7xf16> %112 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%110, %arg83, %arg84 : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) outs(%111 : tensor<4x512x7x7xf16>) : tensor<4x512x7x7xf16> - %113:2 = call @Unknown55(%112, %103) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) + %113:2 = call @Unknown57(%112, %103) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) %114 = tensor.empty() : tensor<4x512x7x7xf16> %115 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%113#0, %23 : tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) outs(%114 : tensor<4x512x7x7xf16>) : tensor<4x512x7x7xf16> %116 = tensor.empty() : tensor<4x512x7x7xf16> %117 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%115, %arg93, %arg94 : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) outs(%116 : tensor<4x512x7x7xf16>) : tensor<4x512x7x7xf16> - %118:2 = call @Unknown57(%117) : (tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) + %118:2 = call @Unknown55(%117) : (tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) %119 = tensor.empty() : tensor<4x512x7x7xf16> %120 = byre.compute_on_tensor @ConvOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : 
tensor<2xi64>, output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%118#0, %24 : tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) outs(%119 : tensor<4x512x7x7xf16>) : tensor<4x512x7x7xf16> %121 = tensor.empty() : tensor<4x512x7x7xf16> %122 = byre.compute_on_tensor @BatchNormTrainingOp_f16f32f32_f16 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%120, %arg98, %arg99 : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) outs(%121 : tensor<4x512x7x7xf16>) : tensor<4x512x7x7xf16> - %123:2 = call @Unknown59(%122, %113#0) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) - %124 = tensor.empty() : tensor<4x512xf16> - %125 = byre.compute_on_tensor @ReduceSumOp_f16_f16 {dimensions = dense<[3, 2]> : tensor<2xi64>} ins(%123#0 : tensor<4x512x7x7xf16>) outs(%124 : tensor<4x512xf16>) : tensor<4x512xf16> - %126 = call @Unknown60(%125) : (tensor<4x512xf16>) -> tensor<4x512xf16> - %127 = tensor.empty() : tensor<4x1000xf16> - %128 = byre.compute_on_tensor @MatmulOp_f16f16_f16 {lhs_contracting_dimension = 1 : i64, rhs_contracting_dimension = 1 : i64} ins(%126, %26 : tensor<4x512xf16>, tensor<1000x512xf16>) outs(%127 : tensor<4x1000xf16>) : tensor<4x1000xf16> - %129 = call @Unknown61(%arg103, %128) : (tensor<1000xf32>, tensor<4x1000xf16>) -> tensor<4x1000xf16> - %130 = tensor.empty() : tensor<4xf16> - %131 = byre.compute_on_tensor @ReduceMaxOp_f16_f16 {dimensions = dense<1> : tensor<1xi64>} ins(%129 : tensor<4x1000xf16>) outs(%130 : tensor<4xf16>) : tensor<4xf16> - %132:2 = call @Unknown62(%131, %129) : (tensor<4xf16>, tensor<4x1000xf16>) -> (tensor<4x1000xf16>, tensor<4x1000xf16>) - %133 = tensor.empty() : tensor<4xf16> - %134 = byre.compute_on_tensor @ReduceSumOp_f16_f16 {dimensions = dense<1> : tensor<1xi64>} ins(%132#1 : tensor<4x1000xf16>) outs(%133 : tensor<4xf16>) : tensor<4xf16> - %135:3 = call @Unknown63(%134, %132#0, %28, %25, %arg1) : (tensor<4xf16>, tensor<4x1000xf16>, tensor<4xf16>, tensor<4x1000xf16>, tensor<4x1000xf32>) -> (tensor<4x1000xf16>, tensor<4x1000xf32>, tensor<4x1000xf32>) - %136 = tensor.empty() : tensor<4x512xf16> - %137 = byre.compute_on_tensor @MatmulOp_f16f16_f16 {lhs_contracting_dimension = 1 : i64, rhs_contracting_dimension = 0 : i64} ins(%135#0, %26 : tensor<4x1000xf16>, tensor<1000x512xf16>) outs(%136 : tensor<4x512xf16>) : tensor<4x512xf16> - %138 = call @Unknown64(%137, %123#1) : (tensor<4x512xf16>, tensor<4x512x7x7xi1>) -> tensor<4x512x7x7xf16> - %139 = tensor.empty() : tensor<4x512x7x7xf16> - %140 = tensor.empty() : tensor<512xf32> - %141 = tensor.empty() : tensor<512xf32> - %142:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%120, %arg98, %138 : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) outs(%139, %140, %141 : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - %143 = tensor.empty() : tensor<4x512x7x7xf16> - %144 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%142#0, %24 : tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) outs(%143 : tensor<4x512x7x7xf16>) : tensor<4x512x7x7xf16> - %145 = tensor.empty() : tensor<512x512x3x3xf16> - 
%146 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%118#0, %142#0 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) outs(%145 : tensor<512x512x3x3xf16>) : tensor<512x512x3x3xf16> - %147 = call @Unknown68(%118#1, %144) : (tensor<4x512x7x7xi1>, tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf16> - %148 = tensor.empty() : tensor<4x512x7x7xf16> - %149 = tensor.empty() : tensor<512xf32> - %150 = tensor.empty() : tensor<512xf32> - %151:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%115, %arg93, %147 : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) outs(%148, %149, %150 : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - %152 = tensor.empty() : tensor<4x512x7x7xf16> - %153 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%151#0, %23 : tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) outs(%152 : tensor<4x512x7x7xf16>) : tensor<4x512x7x7xf16> - %154 = tensor.empty() : tensor<512x512x3x3xf16> - %155 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%113#0, %151#0 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) outs(%154 : tensor<512x512x3x3xf16>) : tensor<512x512x3x3xf16> - %156 = call @Unknown72(%138, %153, %113#1) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) -> tensor<4x512x7x7xf16> - %157 = tensor.empty() : tensor<4x512x7x7xf16> - %158 = tensor.empty() : tensor<512xf32> - %159 = tensor.empty() : tensor<512xf32> - %160:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%110, %arg83, %156 : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) outs(%157, %158, %159 : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - %161 = tensor.empty() : tensor<4x512x7x7xf16> - %162 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%160#0, %22 : tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) outs(%161 : tensor<4x512x7x7xf16>) : tensor<4x512x7x7xf16> - %163 = tensor.empty() : tensor<512x512x3x3xf16> - %164 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%108#0, %160#0 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) outs(%163 : tensor<512x512x3x3xf16>) : tensor<512x512x3x3xf16> - %165 = call @Unknown76(%108#1, %162) : (tensor<4x512x7x7xi1>, tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf16> - 
%166 = tensor.empty() : tensor<4x512x7x7xf16> - %167 = tensor.empty() : tensor<512xf32> - %168 = tensor.empty() : tensor<512xf32> - %169:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%105, %arg78, %165 : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) outs(%166, %167, %168 : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - %170 = tensor.empty() : tensor<4x256x14x14xf16> - %171 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%169#0, %21 : tensor<4x512x7x7xf16>, tensor<512x256x3x3xf16>) outs(%170 : tensor<4x256x14x14xf16>) : tensor<4x256x14x14xf16> - %172 = tensor.empty() : tensor<512x256x3x3xf16> - %173 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%99#0, %169#0 : tensor<4x256x14x14xf16>, tensor<4x512x7x7xf16>) outs(%172 : tensor<512x256x3x3xf16>) : tensor<512x256x3x3xf16> - %174 = tensor.empty() : tensor<4x512x7x7xf16> - %175 = tensor.empty() : tensor<512xf32> - %176 = tensor.empty() : tensor<512xf32> - %177:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%101, %arg88, %156 : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) outs(%174, %175, %176 : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> - %178 = tensor.empty() : tensor<4x256x14x14xf16> - %179 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%177#0, %20 : tensor<4x512x7x7xf16>, tensor<512x256x1x1xf16>) outs(%178 : tensor<4x256x14x14xf16>) : tensor<4x256x14x14xf16> - %180 = tensor.empty() : tensor<512x256x1x1xf16> - %181 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%99#0, %177#0 : tensor<4x256x14x14xf16>, tensor<4x512x7x7xf16>) outs(%180 : tensor<512x256x1x1xf16>) : tensor<512x256x1x1xf16> - %182 = call @Unknown83(%179, %171, %99#1) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) -> tensor<4x256x14x14xf16> - %183 = tensor.empty() : tensor<4x256x14x14xf16> - %184 = tensor.empty() : tensor<256xf32> - %185 = tensor.empty() : tensor<256xf32> - %186:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%96, %arg73, %182 : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) outs(%183, %184, %185 : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - %187 = tensor.empty() : tensor<4x256x14x14xf16> - %188 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 
{batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%186#0, %19 : tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) outs(%187 : tensor<4x256x14x14xf16>) : tensor<4x256x14x14xf16> - %189 = tensor.empty() : tensor<256x256x3x3xf16> - %190 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%94#0, %186#0 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) outs(%189 : tensor<256x256x3x3xf16>) : tensor<256x256x3x3xf16> - %191 = call @Unknown87(%94#1, %188) : (tensor<4x256x14x14xi1>, tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf16> - %192 = tensor.empty() : tensor<4x256x14x14xf16> - %193 = tensor.empty() : tensor<256xf32> - %194 = tensor.empty() : tensor<256xf32> - %195:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%91, %arg68, %191 : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) outs(%192, %193, %194 : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - %196 = tensor.empty() : tensor<4x256x14x14xf16> - %197 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%195#0, %18 : tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) outs(%196 : tensor<4x256x14x14xf16>) : tensor<4x256x14x14xf16> - %198 = tensor.empty() : tensor<256x256x3x3xf16> - %199 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%89#0, %195#0 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) outs(%198 : tensor<256x256x3x3xf16>) : tensor<256x256x3x3xf16> - %200 = call @Unknown91(%182, %197, %89#1) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) -> tensor<4x256x14x14xf16> - %201 = tensor.empty() : tensor<4x256x14x14xf16> - %202 = tensor.empty() : tensor<256xf32> - %203 = tensor.empty() : tensor<256xf32> - %204:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%86, %arg58, %200 : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) outs(%201, %202, %203 : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - %205 = tensor.empty() : tensor<4x256x14x14xf16> - %206 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%204#0, %17 : tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) outs(%205 : tensor<4x256x14x14xf16>) : tensor<4x256x14x14xf16> - %207 = tensor.empty() : tensor<256x256x3x3xf16> - %208 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 
{batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%84#0, %204#0 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) outs(%207 : tensor<256x256x3x3xf16>) : tensor<256x256x3x3xf16> - %209 = call @Unknown95(%84#1, %206) : (tensor<4x256x14x14xi1>, tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf16> - %210 = tensor.empty() : tensor<4x256x14x14xf16> - %211 = tensor.empty() : tensor<256xf32> - %212 = tensor.empty() : tensor<256xf32> - %213:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%81, %arg53, %209 : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) outs(%210, %211, %212 : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - %214 = tensor.empty() : tensor<4x128x28x28xf16> - %215 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%213#0, %16 : tensor<4x256x14x14xf16>, tensor<256x128x3x3xf16>) outs(%214 : tensor<4x128x28x28xf16>) : tensor<4x128x28x28xf16> - %216 = tensor.empty() : tensor<256x128x3x3xf16> - %217 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%75#0, %213#0 : tensor<4x128x28x28xf16>, tensor<4x256x14x14xf16>) outs(%216 : tensor<256x128x3x3xf16>) : tensor<256x128x3x3xf16> - %218 = tensor.empty() : tensor<4x256x14x14xf16> - %219 = tensor.empty() : tensor<256xf32> - %220 = tensor.empty() : tensor<256xf32> - %221:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%77, %arg63, %200 : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) outs(%218, %219, %220 : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> - %222 = tensor.empty() : tensor<4x128x28x28xf16> - %223 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%221#0, %15 : tensor<4x256x14x14xf16>, tensor<256x128x1x1xf16>) outs(%222 : tensor<4x128x28x28xf16>) : tensor<4x128x28x28xf16> - %224 = tensor.empty() : tensor<256x128x1x1xf16> - %225 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%75#0, %221#0 : tensor<4x128x28x28xf16>, tensor<4x256x14x14xf16>) outs(%224 : tensor<256x128x1x1xf16>) : tensor<256x128x1x1xf16> - %226 = call @Unknown102(%223, %215, %75#1) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) -> tensor<4x128x28x28xf16> - %227 = tensor.empty() : tensor<4x128x28x28xf16> - %228 = tensor.empty() : tensor<128xf32> - %229 = tensor.empty() : 
tensor<128xf32> - %230:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%72, %arg48, %226 : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) outs(%227, %228, %229 : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - %231 = tensor.empty() : tensor<4x128x28x28xf16> - %232 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%230#0, %14 : tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) outs(%231 : tensor<4x128x28x28xf16>) : tensor<4x128x28x28xf16> - %233 = tensor.empty() : tensor<128x128x3x3xf16> - %234 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%70#0, %230#0 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) outs(%233 : tensor<128x128x3x3xf16>) : tensor<128x128x3x3xf16> - %235 = call @Unknown106(%70#1, %232) : (tensor<4x128x28x28xi1>, tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf16> - %236 = tensor.empty() : tensor<4x128x28x28xf16> - %237 = tensor.empty() : tensor<128xf32> - %238 = tensor.empty() : tensor<128xf32> - %239:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%67, %arg43, %235 : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) outs(%236, %237, %238 : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - %240 = tensor.empty() : tensor<4x128x28x28xf16> - %241 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%239#0, %13 : tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) outs(%240 : tensor<4x128x28x28xf16>) : tensor<4x128x28x28xf16> - %242 = tensor.empty() : tensor<128x128x3x3xf16> - %243 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%65#0, %239#0 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) outs(%242 : tensor<128x128x3x3xf16>) : tensor<128x128x3x3xf16> - %244 = call @Unknown110(%226, %241, %65#1) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) -> tensor<4x128x28x28xf16> - %245 = tensor.empty() : tensor<4x128x28x28xf16> - %246 = tensor.empty() : tensor<128xf32> - %247 = tensor.empty() : tensor<128xf32> - %248:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%62, %arg33, %244 : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) outs(%245, %246, %247 : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - %249 = tensor.empty() : tensor<4x128x28x28xf16> - %250 = byre.compute_on_tensor 
@ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%248#0, %12 : tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) outs(%249 : tensor<4x128x28x28xf16>) : tensor<4x128x28x28xf16> - %251 = tensor.empty() : tensor<128x128x3x3xf16> - %252 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%60#0, %248#0 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) outs(%251 : tensor<128x128x3x3xf16>) : tensor<128x128x3x3xf16> - %253 = call @Unknown114(%60#1, %250) : (tensor<4x128x28x28xi1>, tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf16> - %254 = tensor.empty() : tensor<4x128x28x28xf16> - %255 = tensor.empty() : tensor<128xf32> - %256 = tensor.empty() : tensor<128xf32> - %257:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%57, %arg28, %253 : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) outs(%254, %255, %256 : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - %258 = tensor.empty() : tensor<4x64x56x56xf16> - %259 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%257#0, %11 : tensor<4x128x28x28xf16>, tensor<128x64x3x3xf16>) outs(%258 : tensor<4x64x56x56xf16>) : tensor<4x64x56x56xf16> - %260 = tensor.empty() : tensor<128x64x3x3xf16> - %261 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%51#0, %257#0 : tensor<4x64x56x56xf16>, tensor<4x128x28x28xf16>) outs(%260 : tensor<128x64x3x3xf16>) : tensor<128x64x3x3xf16> - %262 = tensor.empty() : tensor<4x128x28x28xf16> - %263 = tensor.empty() : tensor<128xf32> - %264 = tensor.empty() : tensor<128xf32> - %265:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%53, %arg38, %244 : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) outs(%262, %263, %264 : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> - %266 = tensor.empty() : tensor<4x64x56x56xf16> - %267 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%265#0, %10 : tensor<4x128x28x28xf16>, tensor<128x64x1x1xf16>) outs(%266 : tensor<4x64x56x56xf16>) : tensor<4x64x56x56xf16> - %268 = tensor.empty() : tensor<128x64x1x1xf16> - %269 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = 
dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%51#0, %265#0 : tensor<4x64x56x56xf16>, tensor<4x128x28x28xf16>) outs(%268 : tensor<128x64x1x1xf16>) : tensor<128x64x1x1xf16> - %270 = call @Unknown121(%267, %259, %51#1) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) -> tensor<4x64x56x56xf16> - %271 = tensor.empty() : tensor<4x64x56x56xf16> - %272 = tensor.empty() : tensor<64xf32> - %273 = tensor.empty() : tensor<64xf32> - %274:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%48, %arg23, %270 : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<4x64x56x56xf16>) outs(%271, %272, %273 : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> - %275 = tensor.empty() : tensor<4x64x56x56xf16> - %276 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%274#0, %9 : tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) outs(%275 : tensor<4x64x56x56xf16>) : tensor<4x64x56x56xf16> - %277 = tensor.empty() : tensor<64x64x3x3xf16> - %278 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%46#0, %274#0 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) outs(%277 : tensor<64x64x3x3xf16>) : tensor<64x64x3x3xf16> - %279 = call @Unknown125(%46#1, %276) : (tensor<4x64x56x56xi1>, tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> - %280 = tensor.empty() : tensor<4x64x56x56xf16> - %281 = tensor.empty() : tensor<64xf32> - %282 = tensor.empty() : tensor<64xf32> - %283:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%43, %arg18, %279 : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<4x64x56x56xf16>) outs(%280, %281, %282 : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> - %284 = tensor.empty() : tensor<4x64x56x56xf16> - %285 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%283#0, %8 : tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) outs(%284 : tensor<4x64x56x56xf16>) : tensor<4x64x56x56xf16> - %286 = tensor.empty() : tensor<64x64x3x3xf16> - %287 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%41#0, %283#0 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) outs(%286 : tensor<64x64x3x3xf16>) : tensor<64x64x3x3xf16> - %288 = call @Unknown129(%270, %285, %41#1) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) -> tensor<4x64x56x56xf16> - %289 = tensor.empty() : tensor<4x64x56x56xf16> - %290 = tensor.empty() : tensor<64xf32> - %291 = tensor.empty() : tensor<64xf32> - %292:3 = byre.compute_on_tensor 
@BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%38, %arg13, %288 : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<4x64x56x56xf16>) outs(%289, %290, %291 : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> - %293 = tensor.empty() : tensor<4x64x56x56xf16> - %294 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%292#0, %7 : tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) outs(%293 : tensor<4x64x56x56xf16>) : tensor<4x64x56x56xf16> - %295 = tensor.empty() : tensor<64x64x3x3xf16> - %296 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%36#0, %292#0 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) outs(%295 : tensor<64x64x3x3xf16>) : tensor<64x64x3x3xf16> - %297 = call @Unknown133(%36#1, %294) : (tensor<4x64x56x56xi1>, tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> - %298 = tensor.empty() : tensor<4x64x56x56xf16> - %299 = tensor.empty() : tensor<64xf32> - %300 = tensor.empty() : tensor<64xf32> - %301:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%33, %arg8, %297 : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<4x64x56x56xf16>) outs(%298, %299, %300 : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> - %302 = tensor.empty() : tensor<4x64x56x56xf16> - %303 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%301#0, %6 : tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) outs(%302 : tensor<4x64x56x56xf16>) : tensor<4x64x56x56xf16> - %304 = tensor.empty() : tensor<64x64x3x3xf16> - %305 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%31, %301#0 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) outs(%304 : tensor<64x64x3x3xf16>) : tensor<64x64x3x3xf16> - %306 = call @Unknown137(%288, %303) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> - %307 = tensor.empty() : tensor<4x64x112x112xf16> - %308 = byre.compute_on_tensor @PoolMaxGradOp_f16f16_f16 {padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} ins(%29#0, %306 : tensor<4x64x112x112xf16>, tensor<4x64x56x56xf16>) outs(%307 : tensor<4x64x112x112xf16>) : tensor<4x64x112x112xf16> - %309 = call @Unknown138(%29#1, %308) : (tensor<4x64x112x112xi1>, tensor<4x64x112x112xf16>) -> tensor<4x64x112x112xf16> - %310 = tensor.empty() : tensor<4x64x112x112xf16> - %311 = tensor.empty() : tensor<64xf32> - %312 = tensor.empty() : tensor<64xf32> - %313:3 = byre.compute_on_tensor 
@BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%3, %arg3, %309 : tensor<4x64x112x112xf16>, tensor<64xf32>, tensor<4x64x112x112xf16>) outs(%310, %311, %312 : tensor<4x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) : tensor<4x64x112x112xf16>, tensor<64xf32>, tensor<64xf32> - %314 = tensor.empty() : tensor<64x3x7x7xf16> - %315 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%0, %313#0 : tensor<4x3x224x224xf16>, tensor<4x64x112x112xf16>) outs(%314 : tensor<64x3x7x7xf16>) : tensor<64x3x7x7xf16> - %316 = tensor.empty() : tensor - %317 = byre.compute_on_tensor @ReduceSumOp_f32_f32 {dimensions = dense<[0, 1]> : tensor<2xi64>} ins(%135#1 : tensor<4x1000xf32>) outs(%316 : tensor) : tensor - %318 = call @Unknown141(%317) : (tensor) -> tensor - %319 = call @Unknown142(%315) : (tensor<64x3x7x7xf16>) -> tensor<64x3x7x7xf32> - %320 = call @Unknown143(%305) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - %321 = call @Unknown144(%296) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - %322 = call @Unknown145(%287) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - %323 = call @Unknown146(%278) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> - %324 = call @Unknown147(%261) : (tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> - %325 = call @Unknown148(%252) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> - %326 = call @Unknown149(%269) : (tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> - %327 = call @Unknown150(%243) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> - %328 = call @Unknown151(%234) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> - %329 = call @Unknown152(%217) : (tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> - %330 = call @Unknown153(%208) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> - %331 = call @Unknown154(%225) : (tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> - %332 = call @Unknown155(%199) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> - %333 = call @Unknown156(%190) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> - %334 = call @Unknown157(%173) : (tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> - %335 = call @Unknown158(%164) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> - %336 = call @Unknown159(%181) : (tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> - %337 = call @Unknown160(%155) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> - %338 = call @Unknown161(%146) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> - %339 = tensor.empty() : tensor<1000x512xf16> - %340 = byre.compute_on_tensor @MatmulOp_f16f16_f16 {lhs_contracting_dimension = 0 : i64, output_transpose, rhs_contracting_dimension = 0 : i64} ins(%126, %135#0 : tensor<4x512xf16>, tensor<4x1000xf16>) outs(%339 : tensor<1000x512xf16>) : tensor<1000x512xf16> - %341 = call @Unknown163(%340) : (tensor<1000x512xf16>) -> tensor<1000x512xf32> - %342 = tensor.empty() : tensor<1000xf32> - %343 = byre.compute_on_tensor @ReduceSumOp_f32_f32 {dimensions = dense<0> : tensor<1xi64>} ins(%135#2 : tensor<4x1000xf32>) outs(%342 : tensor<1000xf32>) : tensor<1000xf32> - %344 = call @Unknown164(%343) : (tensor<1000xf32>) -> tensor<1000xf32> - return %318, %319, %313#1, %313#2, %320, %301#1, %301#2, %321, %292#1, %292#2, %322, %283#1, %283#2, %323, %274#1, %274#2, %324, %257#1, 
%257#2, %325, %248#1, %248#2, %326, %265#1, %265#2, %327, %239#1, %239#2, %328, %230#1, %230#2, %329, %213#1, %213#2, %330, %204#1, %204#2, %331, %221#1, %221#2, %332, %195#1, %195#2, %333, %186#1, %186#2, %334, %169#1, %169#2, %335, %160#1, %160#2, %336, %177#1, %177#2, %337, %151#1, %151#2, %338, %142#1, %142#2, %341, %344 : tensor, tensor<64x3x7x7xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128x64x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x64x1x1xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256x128x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x128x1x1xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512x256x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x256x1x1xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1000x512xf32>, tensor<1000xf32> + %123:2 = call @Unknown57(%122, %113#0) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) -> (tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) + %124 = call @Unknown62(%123#0) : (tensor<4x512x7x7xf16>) -> tensor<4x512xf16> + %125 = call @Unknown63(%124) : (tensor<4x512xf16>) -> tensor<4x512xf16> + %126 = tensor.empty() : tensor<4x1000xf16> + %127 = byre.compute_on_tensor @MatmulOp_f16f16_f16 {lhs_contracting_dimension = 1 : i64, rhs_contracting_dimension = 1 : i64} ins(%125, %26 : tensor<4x512xf16>, tensor<1000x512xf16>) outs(%126 : tensor<4x1000xf16>) : tensor<4x1000xf16> + %128 = call @Unknown64(%27, %127) : (tensor<1000xf16>, tensor<4x1000xf16>) -> tensor<4x1000xf16> + %129 = call @Unknown65(%128) : (tensor<4x1000xf16>) -> tensor<4xf16> + %130 = call @Unknown66(%129, %128) : (tensor<4xf16>, tensor<4x1000xf16>) -> tensor<4x1000xf16> + %131 = call @Unknown67(%130) : (tensor<4x1000xf16>) -> tensor<4xf16> + %132 = call @Unknown68(%131) : (tensor<4xf16>) -> tensor<4xf16> + %133:2 = call @Unknown69(%132, %130, %28, %25) : (tensor<4xf16>, tensor<4x1000xf16>, tensor<4xf16>, tensor<4x1000xf16>) -> (tensor<4x1000xf16>, tensor<4x1000xf16>) + %134 = tensor.empty() : tensor<4x512xf16> + %135 = byre.compute_on_tensor @MatmulOp_f16f16_f16 {lhs_contracting_dimension = 1 : i64, rhs_contracting_dimension = 0 : i64} ins(%133#1, %26 : tensor<4x1000xf16>, tensor<1000x512xf16>) outs(%134 : tensor<4x512xf16>) : tensor<4x512xf16> + %136 = call @Unknown70(%135, %123#1) : (tensor<4x512xf16>, tensor<4x512x7x7xi1>) -> tensor<4x512x7x7xf16> + %137 = tensor.empty() : tensor<4x512x7x7xf16> + %138 = tensor.empty() : tensor<512xf32> + %139 = tensor.empty() : tensor<512xf32> + %140:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%120, %arg98, %136 : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) outs(%137, %138, %139 : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) : tensor<4x512x7x7xf16>, tensor<512xf32>, 
tensor<512xf32> + %141 = tensor.empty() : tensor<4x512x7x7xf16> + %142 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%140#0, %24 : tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) outs(%141 : tensor<4x512x7x7xf16>) : tensor<4x512x7x7xf16> + %143 = tensor.empty() : tensor<512x512x3x3xf16> + %144 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%118#0, %140#0 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) outs(%143 : tensor<512x512x3x3xf16>) : tensor<512x512x3x3xf16> + %145 = call @Unknown74(%118#1, %142) : (tensor<4x512x7x7xi1>, tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf16> + %146 = tensor.empty() : tensor<4x512x7x7xf16> + %147 = tensor.empty() : tensor<512xf32> + %148 = tensor.empty() : tensor<512xf32> + %149:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%115, %arg93, %145 : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) outs(%146, %147, %148 : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> + %150 = tensor.empty() : tensor<4x512x7x7xf16> + %151 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%149#0, %23 : tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) outs(%150 : tensor<4x512x7x7xf16>) : tensor<4x512x7x7xf16> + %152 = tensor.empty() : tensor<512x512x3x3xf16> + %153 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%113#0, %149#0 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) outs(%152 : tensor<512x512x3x3xf16>) : tensor<512x512x3x3xf16> + %154 = call @Unknown78(%136, %151, %113#1) : (tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>, tensor<4x512x7x7xi1>) -> tensor<4x512x7x7xf16> + %155 = tensor.empty() : tensor<4x512x7x7xf16> + %156 = tensor.empty() : tensor<512xf32> + %157 = tensor.empty() : tensor<512xf32> + %158:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%110, %arg83, %154 : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) outs(%155, %156, %157 : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> + %159 = tensor.empty() : tensor<4x512x7x7xf16> + %160 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%158#0, %22 : tensor<4x512x7x7xf16>, tensor<512x512x3x3xf16>) outs(%159 : tensor<4x512x7x7xf16>) : tensor<4x512x7x7xf16> + %161 = tensor.empty() : tensor<512x512x3x3xf16> + %162 = 
byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%108#0, %158#0 : tensor<4x512x7x7xf16>, tensor<4x512x7x7xf16>) outs(%161 : tensor<512x512x3x3xf16>) : tensor<512x512x3x3xf16> + %163 = call @Unknown74(%108#1, %160) : (tensor<4x512x7x7xi1>, tensor<4x512x7x7xf16>) -> tensor<4x512x7x7xf16> + %164 = tensor.empty() : tensor<4x512x7x7xf16> + %165 = tensor.empty() : tensor<512xf32> + %166 = tensor.empty() : tensor<512xf32> + %167:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%105, %arg78, %163 : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) outs(%164, %165, %166 : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> + %168 = tensor.empty() : tensor<4x256x14x14xf16> + %169 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%167#0, %21 : tensor<4x512x7x7xf16>, tensor<512x256x3x3xf16>) outs(%168 : tensor<4x256x14x14xf16>) : tensor<4x256x14x14xf16> + %170 = tensor.empty() : tensor<512x256x3x3xf16> + %171 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%99#0, %167#0 : tensor<4x256x14x14xf16>, tensor<4x512x7x7xf16>) outs(%170 : tensor<512x256x3x3xf16>) : tensor<512x256x3x3xf16> + %172 = tensor.empty() : tensor<4x512x7x7xf16> + %173 = tensor.empty() : tensor<512xf32> + %174 = tensor.empty() : tensor<512xf32> + %175:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%101, %arg88, %154 : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<4x512x7x7xf16>) outs(%172, %173, %174 : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32>) : tensor<4x512x7x7xf16>, tensor<512xf32>, tensor<512xf32> + %176 = tensor.empty() : tensor<4x256x14x14xf16> + %177 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%175#0, %20 : tensor<4x512x7x7xf16>, tensor<512x256x1x1xf16>) outs(%176 : tensor<4x256x14x14xf16>) : tensor<4x256x14x14xf16> + %178 = tensor.empty() : tensor<512x256x1x1xf16> + %179 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%99#0, %175#0 : tensor<4x256x14x14xf16>, tensor<4x512x7x7xf16>) outs(%178 : tensor<512x256x1x1xf16>) : tensor<512x256x1x1xf16> + %180 = call @Unknown89(%177, %169, %99#1) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) -> tensor<4x256x14x14xf16> + %181 = tensor.empty() : tensor<4x256x14x14xf16> + %182 = tensor.empty() : tensor<256xf32> + %183 = 
tensor.empty() : tensor<256xf32> + %184:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%96, %arg73, %180 : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) outs(%181, %182, %183 : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> + %185 = tensor.empty() : tensor<4x256x14x14xf16> + %186 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%184#0, %19 : tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) outs(%185 : tensor<4x256x14x14xf16>) : tensor<4x256x14x14xf16> + %187 = tensor.empty() : tensor<256x256x3x3xf16> + %188 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%94#0, %184#0 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) outs(%187 : tensor<256x256x3x3xf16>) : tensor<256x256x3x3xf16> + %189 = call @Unknown93(%94#1, %186) : (tensor<4x256x14x14xi1>, tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf16> + %190 = tensor.empty() : tensor<4x256x14x14xf16> + %191 = tensor.empty() : tensor<256xf32> + %192 = tensor.empty() : tensor<256xf32> + %193:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%91, %arg68, %189 : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) outs(%190, %191, %192 : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> + %194 = tensor.empty() : tensor<4x256x14x14xf16> + %195 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%193#0, %18 : tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) outs(%194 : tensor<4x256x14x14xf16>) : tensor<4x256x14x14xf16> + %196 = tensor.empty() : tensor<256x256x3x3xf16> + %197 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%89#0, %193#0 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) outs(%196 : tensor<256x256x3x3xf16>) : tensor<256x256x3x3xf16> + %198 = call @Unknown89(%180, %195, %89#1) : (tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>, tensor<4x256x14x14xi1>) -> tensor<4x256x14x14xf16> + %199 = tensor.empty() : tensor<4x256x14x14xf16> + %200 = tensor.empty() : tensor<256xf32> + %201 = tensor.empty() : tensor<256xf32> + %202:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%86, %arg58, %198 : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) outs(%199, %200, %201 : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> + %203 = tensor.empty() : tensor<4x256x14x14xf16> + %204 = 
byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%202#0, %17 : tensor<4x256x14x14xf16>, tensor<256x256x3x3xf16>) outs(%203 : tensor<4x256x14x14xf16>) : tensor<4x256x14x14xf16> + %205 = tensor.empty() : tensor<256x256x3x3xf16> + %206 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%84#0, %202#0 : tensor<4x256x14x14xf16>, tensor<4x256x14x14xf16>) outs(%205 : tensor<256x256x3x3xf16>) : tensor<256x256x3x3xf16> + %207 = call @Unknown93(%84#1, %204) : (tensor<4x256x14x14xi1>, tensor<4x256x14x14xf16>) -> tensor<4x256x14x14xf16> + %208 = tensor.empty() : tensor<4x256x14x14xf16> + %209 = tensor.empty() : tensor<256xf32> + %210 = tensor.empty() : tensor<256xf32> + %211:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%81, %arg53, %207 : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) outs(%208, %209, %210 : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> + %212 = tensor.empty() : tensor<4x128x28x28xf16> + %213 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%211#0, %16 : tensor<4x256x14x14xf16>, tensor<256x128x3x3xf16>) outs(%212 : tensor<4x128x28x28xf16>) : tensor<4x128x28x28xf16> + %214 = tensor.empty() : tensor<256x128x3x3xf16> + %215 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%75#0, %211#0 : tensor<4x128x28x28xf16>, tensor<4x256x14x14xf16>) outs(%214 : tensor<256x128x3x3xf16>) : tensor<256x128x3x3xf16> + %216 = tensor.empty() : tensor<4x256x14x14xf16> + %217 = tensor.empty() : tensor<256xf32> + %218 = tensor.empty() : tensor<256xf32> + %219:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%77, %arg63, %198 : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<4x256x14x14xf16>) outs(%216, %217, %218 : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32>) : tensor<4x256x14x14xf16>, tensor<256xf32>, tensor<256xf32> + %220 = tensor.empty() : tensor<4x128x28x28xf16> + %221 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%219#0, %15 : tensor<4x256x14x14xf16>, tensor<256x128x1x1xf16>) outs(%220 : tensor<4x128x28x28xf16>) : tensor<4x128x28x28xf16> + %222 = tensor.empty() : tensor<256x128x1x1xf16> + %223 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = 
"NCHW", output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%75#0, %219#0 : tensor<4x128x28x28xf16>, tensor<4x256x14x14xf16>) outs(%222 : tensor<256x128x1x1xf16>) : tensor<256x128x1x1xf16> + %224 = call @Unknown108(%221, %213, %75#1) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) -> tensor<4x128x28x28xf16> + %225 = tensor.empty() : tensor<4x128x28x28xf16> + %226 = tensor.empty() : tensor<128xf32> + %227 = tensor.empty() : tensor<128xf32> + %228:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%72, %arg48, %224 : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) outs(%225, %226, %227 : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> + %229 = tensor.empty() : tensor<4x128x28x28xf16> + %230 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%228#0, %14 : tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) outs(%229 : tensor<4x128x28x28xf16>) : tensor<4x128x28x28xf16> + %231 = tensor.empty() : tensor<128x128x3x3xf16> + %232 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%70#0, %228#0 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) outs(%231 : tensor<128x128x3x3xf16>) : tensor<128x128x3x3xf16> + %233 = call @Unknown112(%70#1, %230) : (tensor<4x128x28x28xi1>, tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf16> + %234 = tensor.empty() : tensor<4x128x28x28xf16> + %235 = tensor.empty() : tensor<128xf32> + %236 = tensor.empty() : tensor<128xf32> + %237:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%67, %arg43, %233 : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) outs(%234, %235, %236 : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> + %238 = tensor.empty() : tensor<4x128x28x28xf16> + %239 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%237#0, %13 : tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) outs(%238 : tensor<4x128x28x28xf16>) : tensor<4x128x28x28xf16> + %240 = tensor.empty() : tensor<128x128x3x3xf16> + %241 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%65#0, %237#0 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) outs(%240 : tensor<128x128x3x3xf16>) : tensor<128x128x3x3xf16> + %242 = call @Unknown108(%224, %239, %65#1) : (tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>, tensor<4x128x28x28xi1>) -> tensor<4x128x28x28xf16> + %243 = tensor.empty() : tensor<4x128x28x28xf16> + %244 = tensor.empty() 
: tensor<128xf32> + %245 = tensor.empty() : tensor<128xf32> + %246:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%62, %arg33, %242 : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) outs(%243, %244, %245 : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> + %247 = tensor.empty() : tensor<4x128x28x28xf16> + %248 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%246#0, %12 : tensor<4x128x28x28xf16>, tensor<128x128x3x3xf16>) outs(%247 : tensor<4x128x28x28xf16>) : tensor<4x128x28x28xf16> + %249 = tensor.empty() : tensor<128x128x3x3xf16> + %250 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%60#0, %246#0 : tensor<4x128x28x28xf16>, tensor<4x128x28x28xf16>) outs(%249 : tensor<128x128x3x3xf16>) : tensor<128x128x3x3xf16> + %251 = call @Unknown112(%60#1, %248) : (tensor<4x128x28x28xi1>, tensor<4x128x28x28xf16>) -> tensor<4x128x28x28xf16> + %252 = tensor.empty() : tensor<4x128x28x28xf16> + %253 = tensor.empty() : tensor<128xf32> + %254 = tensor.empty() : tensor<128xf32> + %255:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%57, %arg28, %251 : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) outs(%252, %253, %254 : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> + %256 = tensor.empty() : tensor<4x64x56x56xf16> + %257 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%255#0, %11 : tensor<4x128x28x28xf16>, tensor<128x64x3x3xf16>) outs(%256 : tensor<4x64x56x56xf16>) : tensor<4x64x56x56xf16> + %258 = tensor.empty() : tensor<128x64x3x3xf16> + %259 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%51#0, %255#0 : tensor<4x64x56x56xf16>, tensor<4x128x28x28xf16>) outs(%258 : tensor<128x64x3x3xf16>) : tensor<128x64x3x3xf16> + %260 = tensor.empty() : tensor<4x128x28x28xf16> + %261 = tensor.empty() : tensor<128xf32> + %262 = tensor.empty() : tensor<128xf32> + %263:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%53, %arg38, %242 : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<4x128x28x28xf16>) outs(%260, %261, %262 : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32>) : tensor<4x128x28x28xf16>, tensor<128xf32>, tensor<128xf32> + %264 = tensor.empty() : tensor<4x64x56x56xf16> + %265 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", 
kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%263#0, %10 : tensor<4x128x28x28xf16>, tensor<128x64x1x1xf16>) outs(%264 : tensor<4x64x56x56xf16>) : tensor<4x64x56x56xf16> + %266 = tensor.empty() : tensor<128x64x1x1xf16> + %267 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%51#0, %263#0 : tensor<4x64x56x56xf16>, tensor<4x128x28x28xf16>) outs(%266 : tensor<128x64x1x1xf16>) : tensor<128x64x1x1xf16> + %268 = call @Unknown127(%265, %257, %51#1) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) -> tensor<4x64x56x56xf16> + %269 = tensor.empty() : tensor<4x64x56x56xf16> + %270 = tensor.empty() : tensor<64xf32> + %271 = tensor.empty() : tensor<64xf32> + %272:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%48, %arg23, %268 : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<4x64x56x56xf16>) outs(%269, %270, %271 : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> + %273 = tensor.empty() : tensor<4x64x56x56xf16> + %274 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%272#0, %9 : tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) outs(%273 : tensor<4x64x56x56xf16>) : tensor<4x64x56x56xf16> + %275 = tensor.empty() : tensor<64x64x3x3xf16> + %276 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%46#0, %272#0 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) outs(%275 : tensor<64x64x3x3xf16>) : tensor<64x64x3x3xf16> + %277 = call @Unknown131(%46#1, %274) : (tensor<4x64x56x56xi1>, tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> + %278 = tensor.empty() : tensor<4x64x56x56xf16> + %279 = tensor.empty() : tensor<64xf32> + %280 = tensor.empty() : tensor<64xf32> + %281:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%43, %arg18, %277 : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<4x64x56x56xf16>) outs(%278, %279, %280 : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> + %282 = tensor.empty() : tensor<4x64x56x56xf16> + %283 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%281#0, %8 : tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) outs(%282 : tensor<4x64x56x56xf16>) : tensor<4x64x56x56xf16> + %284 = tensor.empty() : tensor<64x64x3x3xf16> + %285 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : 
tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%41#0, %281#0 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) outs(%284 : tensor<64x64x3x3xf16>) : tensor<64x64x3x3xf16> + %286 = call @Unknown127(%268, %283, %41#1) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>, tensor<4x64x56x56xi1>) -> tensor<4x64x56x56xf16> + %287 = tensor.empty() : tensor<4x64x56x56xf16> + %288 = tensor.empty() : tensor<64xf32> + %289 = tensor.empty() : tensor<64xf32> + %290:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%38, %arg13, %286 : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<4x64x56x56xf16>) outs(%287, %288, %289 : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> + %291 = tensor.empty() : tensor<4x64x56x56xf16> + %292 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%290#0, %7 : tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) outs(%291 : tensor<4x64x56x56xf16>) : tensor<4x64x56x56xf16> + %293 = tensor.empty() : tensor<64x64x3x3xf16> + %294 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%36#0, %290#0 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) outs(%293 : tensor<64x64x3x3xf16>) : tensor<64x64x3x3xf16> + %295 = call @Unknown131(%36#1, %292) : (tensor<4x64x56x56xi1>, tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> + %296 = tensor.empty() : tensor<4x64x56x56xf16> + %297 = tensor.empty() : tensor<64xf32> + %298 = tensor.empty() : tensor<64xf32> + %299:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%33, %arg8, %295 : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<4x64x56x56xf16>) outs(%296, %297, %298 : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32>) : tensor<4x64x56x56xf16>, tensor<64xf32>, tensor<64xf32> + %300 = tensor.empty() : tensor<4x64x56x56xf16> + %301 = byre.compute_on_tensor @ConvBackwardDataOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%299#0, %6 : tensor<4x64x56x56xf16>, tensor<64x64x3x3xf16>) outs(%300 : tensor<4x64x56x56xf16>) : tensor<4x64x56x56xf16> + %302 = tensor.empty() : tensor<64x64x3x3xf16> + %303 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} ins(%31, %299#0 : tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) outs(%302 : tensor<64x64x3x3xf16>) : tensor<64x64x3x3xf16> + %304 = call @Unknown143(%286, %301) : (tensor<4x64x56x56xf16>, tensor<4x64x56x56xf16>) -> tensor<4x64x56x56xf16> + %305 = tensor.empty() : tensor<4x64x112x112xf16> + %306 = byre.compute_on_tensor @PoolMaxGradOp_f16f16_f16 {padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 1, 3, 3]> : 
tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} ins(%29#0, %304 : tensor<4x64x112x112xf16>, tensor<4x64x56x56xf16>) outs(%305 : tensor<4x64x112x112xf16>) : tensor<4x64x112x112xf16> + %307 = call @Unknown144(%29#1, %306) : (tensor<4x64x112x112xi1>, tensor<4x64x112x112xf16>) -> tensor<4x64x112x112xf16> + %308 = tensor.empty() : tensor<4x64x112x112xf16> + %309 = tensor.empty() : tensor<64xf32> + %310 = tensor.empty() : tensor<64xf32> + %311:3 = byre.compute_on_tensor @BatchNormGradOp_f16f32f16_f16f32f32 {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64} ins(%3, %arg3, %307 : tensor<4x64x112x112xf16>, tensor<64xf32>, tensor<4x64x112x112xf16>) outs(%308, %309, %310 : tensor<4x64x112x112xf16>, tensor<64xf32>, tensor<64xf32>) : tensor<4x64x112x112xf16>, tensor<64xf32>, tensor<64xf32> + %312 = tensor.empty() : tensor<64x3x7x7xf16> + %313 = byre.compute_on_tensor @ConvBackwardFilterOp_f16f16_f16 {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} ins(%0, %311#0 : tensor<4x3x224x224xf16>, tensor<4x64x112x112xf16>) outs(%312 : tensor<64x3x7x7xf16>) : tensor<64x3x7x7xf16> + %314 = call @Unknown147(%133#0, %arg1) : (tensor<4x1000xf16>, tensor<4x1000xf32>) -> tensor + %315 = call @Unknown148(%314) : (tensor) -> tensor + %316 = call @Unknown149(%313) : (tensor<64x3x7x7xf16>) -> tensor<64x3x7x7xf32> + %317 = call @Unknown150(%303) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> + %318 = call @Unknown150(%294) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> + %319 = call @Unknown150(%285) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> + %320 = call @Unknown150(%276) : (tensor<64x64x3x3xf16>) -> tensor<64x64x3x3xf32> + %321 = call @Unknown154(%259) : (tensor<128x64x3x3xf16>) -> tensor<128x64x3x3xf32> + %322 = call @Unknown155(%250) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> + %323 = call @Unknown156(%267) : (tensor<128x64x1x1xf16>) -> tensor<128x64x1x1xf32> + %324 = call @Unknown155(%241) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> + %325 = call @Unknown155(%232) : (tensor<128x128x3x3xf16>) -> tensor<128x128x3x3xf32> + %326 = call @Unknown159(%215) : (tensor<256x128x3x3xf16>) -> tensor<256x128x3x3xf32> + %327 = call @Unknown160(%206) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> + %328 = call @Unknown161(%223) : (tensor<256x128x1x1xf16>) -> tensor<256x128x1x1xf32> + %329 = call @Unknown160(%197) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> + %330 = call @Unknown160(%188) : (tensor<256x256x3x3xf16>) -> tensor<256x256x3x3xf32> + %331 = call @Unknown164(%171) : (tensor<512x256x3x3xf16>) -> tensor<512x256x3x3xf32> + %332 = call @Unknown165(%162) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> + %333 = call @Unknown166(%179) : (tensor<512x256x1x1xf16>) -> tensor<512x256x1x1xf32> + %334 = call @Unknown165(%153) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> + %335 = call @Unknown165(%144) : (tensor<512x512x3x3xf16>) -> tensor<512x512x3x3xf32> + %336 = tensor.empty() : tensor<1000x512xf16> + %337 = byre.compute_on_tensor @MatmulOp_f16f16_f16 {lhs_contracting_dimension = 0 : i64, output_transpose, rhs_contracting_dimension = 0 : i64} ins(%125, %133#1 : tensor<4x512xf16>, tensor<4x1000xf16>) outs(%336 : tensor<1000x512xf16>) : tensor<1000x512xf16> + %338 = call @Unknown170(%337) : (tensor<1000x512xf16>) -> tensor<1000x512xf32> + %339 = call @Unknown171(%133#1) : 
(tensor<4x1000xf16>) -> tensor<1000xf32> + %340 = call @Unknown172(%339) : (tensor<1000xf32>) -> tensor<1000xf32> + return %315, %316, %311#1, %311#2, %317, %299#1, %299#2, %318, %290#1, %290#2, %319, %281#1, %281#2, %320, %272#1, %272#2, %321, %255#1, %255#2, %322, %246#1, %246#2, %323, %263#1, %263#2, %324, %237#1, %237#2, %325, %228#1, %228#2, %326, %211#1, %211#2, %327, %202#1, %202#2, %328, %219#1, %219#2, %329, %193#1, %193#2, %330, %184#1, %184#2, %331, %167#1, %167#2, %332, %158#1, %158#2, %333, %175#1, %175#2, %334, %149#1, %149#2, %335, %140#1, %140#2, %338, %340 : tensor, tensor<64x3x7x7xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128x64x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x64x1x1xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256x128x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x128x1x1xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512x256x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x256x1x1xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1000x512xf32>, tensor<1000xf32> } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/Whole/5_affine_opt.mlir b/compiler/test/E2E/ResNet18/Whole/5_affine_opt.mlir index 0a797cefb..9f4640116 100644 --- a/compiler/test/E2E/ResNet18/Whole/5_affine_opt.mlir +++ b/compiler/test/E2E/ResNet18/Whole/5_affine_opt.mlir @@ -2,676 +2,1631 @@ // CHECK-LABEL: func.func @main -#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> -#map1 = affine_map<(d0, d1) -> (d0, d1)> -#map2 = affine_map<(d0, d1) -> (d1)> -#map3 = affine_map<(d0, d1) -> (d0)> -#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> -#map5 = affine_map<() -> ()> -#map6 = affine_map<(d0) -> (d0)> +#map = affine_map<() -> ()> +#map1 = affine_map<(d0) -> (d0 * 2 - (d0 floordiv 512) * 1024, 1000)> +#map2 = affine_map<(d0) -> (d0 * 2 - (d0 floordiv 512) * 1024 + 2, 1000)> +#map3 = affine_map<(d0, d1) -> (d0 - d1)> +#map4 = affine_map<(d0) -> (d0 * 2)> +#map5 = affine_map<(d0) -> (d0 * 2 + 1)> +#map6 = affine_map<(d0) -> (d0 mod 64, 49)> +#map7 = affine_map<(d0) -> (d0 mod 64 + 1, 49)> +#map8 = affine_map<(d0) -> (d0 mod 128, 125)> +#map9 = affine_map<(d0) -> (d0 mod 128 + 1, 125)> +#map10 = affine_map<(d0)[s0] -> (d0 * 32 + s0)> +#map11 = affine_map<(d0) -> (d0 * -32 + 1000, 32)> +#map12 = affine_map<(d0) -> (d0 * 32)> +#map13 = affine_map<(d0, d1) -> (d1 * -32 + 1000, 32, d0)> +#map14 = affine_map<(d0, d1) -> (d1 * -32 + 1000, 32, d0 + 1)> module @IrToMhlo.2452 { func.func private @Unknown0(%arg0: memref<4x3x224x224xf32>) -> memref<4x3x224x224xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + %c224 = arith.constant 224 : index %alloc = memref.alloc() : memref<4x3x224x224xf16> 
- linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<4x3x224x224xf32>) outs(%alloc : memref<4x3x224x224xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c3 step %c1 { + scf.for %arg3 = %c0 to %c224 step %c1 { + scf.for %arg4 = %c0 to %c224 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x3x224x224xf32> to memref<f32, strided<[], offset: ?>> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x3x224x224xf16> to memref<f16, strided<[], offset: ?>> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref<f32, strided<[], offset: ?>>) outs(%subview_0 : memref<f16, strided<[], offset: ?>>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<4x3x224x224xf16> } func.func private @Unknown1(%arg0: memref<64x3x7x7xf32>) -> memref<64x3x7x7xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + %c7 = arith.constant 7 : index %alloc = memref.alloc() : memref<64x3x7x7xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x3x7x7xf32>) outs(%alloc : memref<64x3x7x7xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c64 step %c1 { + scf.for %arg2 = %c0 to %c3 step %c1 { + scf.for %arg3 = %c0 to %c7 step %c1 { + scf.for %arg4 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x3x7x7xf32> to memref<f32, strided<[], offset: ?>> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x3x7x7xf16> to memref<f16, strided<[], offset: ?>> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref<f32, strided<[], offset: ?>>) outs(%subview_0 : memref<f16, strided<[], offset: ?>>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<64x3x7x7xf16> } func.func private @Unknown3(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<64x64x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf32>) outs(%alloc : memref<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<64x64x3x3xf16> - } - func.func private @Unknown4(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<64x64x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf32>) outs(%alloc : memref<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<64x64x3x3xf16> - } - func.func private @Unknown5(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc =
memref.alloc() : memref<64x64x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf32>) outs(%alloc : memref<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<64x64x3x3xf16> - } - func.func private @Unknown6(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<64x64x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf32>) outs(%alloc : memref<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c64 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x64x3x3xf32> to memref<f32, strided<[], offset: ?>> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x64x3x3xf16> to memref<f16, strided<[], offset: ?>> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref<f32, strided<[], offset: ?>>) outs(%subview_0 : memref<f16, strided<[], offset: ?>>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<64x64x3x3xf16> } func.func private @Unknown7(%arg0: memref<128x64x1x1xf32>) -> memref<128x64x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index %alloc = memref.alloc() : memref<128x64x1x1xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x64x1x1xf32>) outs(%alloc : memref<128x64x1x1xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x1x1xf32> to memref<f32, strided<[], offset: ?>> + %subview_0 = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x1x1xf16> to memref<f16, strided<[], offset: ?>> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref<f32, strided<[], offset: ?>>) outs(%subview_0 : memref<f16, strided<[], offset: ?>>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } } return %alloc : memref<128x64x1x1xf16> } func.func private @Unknown8(%arg0: memref<128x64x3x3xf32>) -> memref<128x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<128x64x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x64x3x3xf32>) outs(%alloc : memref<128x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1]
[1, 1, 1, 1] : memref<128x64x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<128x64x3x3xf16> } func.func private @Unknown9(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<128x128x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x128x3x3xf32>) outs(%alloc : memref<128x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<128x128x3x3xf16> - } - func.func private @Unknown10(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<128x128x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x128x3x3xf32>) outs(%alloc : memref<128x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<128x128x3x3xf16> - } - func.func private @Unknown11(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<128x128x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x128x3x3xf32>) outs(%alloc : memref<128x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c128 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x128x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x128x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<128x128x3x3xf16> } func.func private @Unknown12(%arg0: memref<256x128x1x1xf32>) -> memref<256x128x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index %alloc = memref.alloc() : memref<256x128x1x1xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x128x1x1xf32>) outs(%alloc : memref<256x128x1x1xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c128 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : 
memref<256x128x1x1xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x1x1xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } } return %alloc : memref<256x128x1x1xf16> } func.func private @Unknown13(%arg0: memref<256x128x3x3xf32>) -> memref<256x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<256x128x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x128x3x3xf32>) outs(%alloc : memref<256x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c128 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<256x128x3x3xf16> } func.func private @Unknown14(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<256x256x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x256x3x3xf32>) outs(%alloc : memref<256x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<256x256x3x3xf16> - } - func.func private @Unknown15(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<256x256x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x256x3x3xf32>) outs(%alloc : memref<256x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<256x256x3x3xf16> - } - func.func private @Unknown16(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<256x256x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x256x3x3xf32>) outs(%alloc : memref<256x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + 
%subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x256x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x256x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<256x256x3x3xf16> } func.func private @Unknown17(%arg0: memref<512x256x1x1xf32>) -> memref<512x256x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<512x256x1x1xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x256x1x1xf32>) outs(%alloc : memref<512x256x1x1xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x1x1xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x1x1xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } } return %alloc : memref<512x256x1x1xf16> } func.func private @Unknown18(%arg0: memref<512x256x3x3xf32>) -> memref<512x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<512x256x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x256x3x3xf32>) outs(%alloc : memref<512x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<512x256x3x3xf16> } func.func private @Unknown19(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<512x512x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x512x3x3xf32>) outs(%alloc : memref<512x512x3x3xf16>) { 
- ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<512x512x3x3xf16> - } - func.func private @Unknown20(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<512x512x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x512x3x3xf32>) outs(%alloc : memref<512x512x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<512x512x3x3xf16> - } - func.func private @Unknown21(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<512x512x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x512x3x3xf32>) outs(%alloc : memref<512x512x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c512 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x512x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x512x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<512x512x3x3xf16> } func.func private @Unknown22(%arg0: memref<4x1000xf32>) -> memref<4x1000xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant -2.500000e-01 : f32 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index %alloc = memref.alloc() : memref<4x1000xf16> - linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg0 : memref<4x1000xf32>) outs(%alloc : memref<4x1000xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.mulf %in, %cst : f32 - %1 = arith.truncf %0 : f32 to f16 - linalg.yield %1 : f16 + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c1000 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2] [1, 1] [1, 1] : memref<4x1000xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2] [1, 1] [1, 1] : memref<4x1000xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.mulf %in, %cst : f32 + %1 = arith.truncf %0 : f32 to f16 + linalg.yield %1 : f16 + } + } } return %alloc : memref<4x1000xf16> } func.func private @Unknown23(%arg0: memref<1000x512xf32>) -> memref<1000x512xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index %alloc = memref.alloc() : memref<1000x512xf16> - linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg0 : memref<1000x512xf32>) outs(%alloc : memref<1000x512xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : 
f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c1000 step %c1 { + scf.for %arg2 = %c0 to %c512 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2] [1, 1] [1, 1] : memref<1000x512xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2] [1, 1] [1, 1] : memref<1000x512xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } } return %alloc : memref<1000x512xf16> } - func.func private @Unknown24(%arg0: memref<4x64x112x112xf16>) -> (memref<4x64x112x112xf16>, memref<4x64x112x112xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown24(%arg0: memref<1000xf32>) -> memref<1000xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index + %alloc = memref.alloc() : memref<1000xf16> + scf.for %arg1 = %c0 to %c1000 step %c1 { + %subview = memref.subview %arg0[%arg1] [1] [1] : memref<1000xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1] [1] [1] : memref<1000xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + return %alloc : memref<1000xf16> + } + func.func private @Unknown25(%arg0: memref<4x1000xf16>) -> memref<4xf16> attributes {__byteir_reduction_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %alloc = memref.alloc() : memref<4xf16> + scf.forall (%arg1) in (4) { + %subview = memref.subview %arg0[%arg1, 0] [1, 1000] [1, 1] : memref<4x1000xf16> to memref<1000xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<1000xf16, strided<[1], offset: ?>> into memref<1x1000xf16, strided<[1000, 1], offset: ?>> + %alloca = memref.alloca() : memref<512xf16, #gpu.address_space> + scf.forall (%arg2) in (512) { + %0 = affine.min #map1(%arg2) + %1 = affine.min #map2(%arg2) + %2 = affine.apply #map3(%1, %0) + %subview_8 = memref.subview %expand_shape[0, %0] [1, %2] [1, 1] : memref<1x1000xf16, strided<[1000, 1], offset: ?>> to memref> + %expand_shape_9 = memref.expand_shape %subview_8 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %3 = arith.cmpi ugt, %2, %c0 : index + %4 = scf.if %3 -> (f16) { + %9 = memref.load %expand_shape_9[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %9 : f16 + } else { + scf.yield %cst : f16 + } + %5 = arith.addf %4, %cst : f16 + %6 = arith.cmpi ugt, %2, %c1 : index + %7 = scf.if %6 -> (f16) { + %9 = memref.load %expand_shape_9[%c0, %c1] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %9 : f16 + } else { + scf.yield %cst : f16 + } + %8 = arith.addf %5, %7 : f16 + memref.store %8, %alloca[%arg2] : memref<512xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_0 = memref.alloca() : memref<256xf16, #gpu.address_space> + scf.forall (%arg2) in (256) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca[%0] : memref<512xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca[%3] : memref<512xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_0[%arg2] : memref<256xf16, #gpu.address_space> + } {mapping = 
[#gpu.thread]} + %alloca_1 = memref.alloca() : memref<128xf16, #gpu.address_space> + scf.forall (%arg2) in (128) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_0[%0] : memref<256xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_0[%3] : memref<256xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_1[%arg2] : memref<128xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_2 = memref.alloca() : memref<64xf16, #gpu.address_space> + scf.forall (%arg2) in (64) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_1[%0] : memref<128xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_1[%3] : memref<128xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_2[%arg2] : memref<64xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_3 = memref.alloca() : memref<32xf16, #gpu.address_space> + scf.forall (%arg2) in (32) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_2[%0] : memref<64xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_2[%3] : memref<64xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_3[%arg2] : memref<32xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_4 = memref.alloca() : memref<16xf16, #gpu.address_space> + scf.forall (%arg2) in (16) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_3[%0] : memref<32xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_3[%3] : memref<32xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_4[%arg2] : memref<16xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_5 = memref.alloca() : memref<8xf16, #gpu.address_space> + scf.forall (%arg2) in (8) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_4[%0] : memref<16xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_4[%3] : memref<16xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_5[%arg2] : memref<8xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_6 = memref.alloca() : memref<4xf16, #gpu.address_space> + scf.forall (%arg2) in (4) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_5[%0] : memref<8xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_5[%3] : memref<8xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_6[%arg2] : memref<4xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_7 = memref.alloca() : memref<2xf16, #gpu.address_space> + scf.forall (%arg2) in (2) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_6[%0] : memref<4xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_6[%3] : memref<4xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_7[%arg2] : memref<2xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + scf.forall (%arg2) in (1) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_7[%0] : memref<2xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_7[%3] : memref<2xf16, 
#gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloc[%arg1] : memref<4xf16> + } {mapping = [#gpu.thread]} + } {mapping = [#gpu.block]} + return %alloc : memref<4xf16> + } + func.func private @Unknown26(%arg0: memref<4x64x112x112xf16>) -> (memref<4x64x112x112xf16>, memref<4x64x112x112xi1>) attributes {__byteir_elementwise_fusion__} { + %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c112 = arith.constant 112 : index %alloc = memref.alloc() : memref<4x64x112x112xf16> %alloc_0 = memref.alloc() : memref<4x64x112x112xi1> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<4x64x112x112xf16>) outs(%alloc, %alloc_0 : memref<4x64x112x112xf16>, memref<4x64x112x112xi1>) { - ^bb0(%in: f16, %out: f16, %out_1: i1): - %0 = arith.maxnumf %in, %cst : f16 - %1 = arith.cmpf ogt, %0, %cst : f16 - linalg.yield %0, %1 : f16, i1 + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c112 step %c1 { + scf.for %arg4 = %c0 to %c112 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x112x112xf16> to memref> + %subview_1 = memref.subview %alloc_0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x112x112xi1> to memref> + %subview_2 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x112x112xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_2, %subview_1 : memref>, memref>) { + ^bb0(%in: f16, %out: f16, %out_3: i1): + %0 = arith.maximumf %in, %cst : f16 + %1 = arith.cmpf ogt, %0, %cst : f16 + linalg.yield %0, %1 : f16, i1 + } + } + } + } } return %alloc, %alloc_0 : memref<4x64x112x112xf16>, memref<4x64x112x112xi1> } - func.func private @Unknown26(%arg0: memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x64x56x56xf16> - %alloc_0 = memref.alloc() : memref<4x64x56x56xi1> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<4x64x56x56xf16>) outs(%alloc, %alloc_0 : memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) { - ^bb0(%in: f16, %out: f16, %out_1: i1): - %0 = arith.maxnumf %in, %cst : f16 - %1 = arith.cmpf ogt, %0, %cst : f16 - linalg.yield %0, %1 : f16, i1 - } - return %alloc, %alloc_0 : memref<4x64x56x56xf16>, memref<4x64x56x56xi1> - } - func.func private @Unknown28(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown28(%arg0: memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %alloc = memref.alloc() : memref<4x64x56x56xf16> %alloc_0 = memref.alloc() : memref<4x64x56x56xi1> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} 
ins(%arg0, %arg1 : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) outs(%alloc, %alloc_0 : memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) { - ^bb0(%in: f16, %in_1: f16, %out: f16, %out_2: i1): - %0 = arith.addf %in, %in_1 : f16 - %1 = arith.maxnumf %0, %cst : f16 - %2 = arith.cmpf ogt, %1, %cst : f16 - linalg.yield %1, %2 : f16, i1 + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c56 step %c1 { + scf.for %arg4 = %c0 to %c56 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xf16> to memref> + %subview_1 = memref.subview %alloc_0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xi1> to memref> + %subview_2 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_2, %subview_1 : memref>, memref>) { + ^bb0(%in: f16, %out: f16, %out_3: i1): + %0 = arith.maximumf %in, %cst : f16 + %1 = arith.cmpf ogt, %0, %cst : f16 + linalg.yield %0, %1 : f16, i1 + } + } + } + } } return %alloc, %alloc_0 : memref<4x64x56x56xf16>, memref<4x64x56x56xi1> } - func.func private @Unknown30(%arg0: memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown30(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %alloc = memref.alloc() : memref<4x64x56x56xf16> %alloc_0 = memref.alloc() : memref<4x64x56x56xi1> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<4x64x56x56xf16>) outs(%alloc, %alloc_0 : memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) { - ^bb0(%in: f16, %out: f16, %out_1: i1): - %0 = arith.maxnumf %in, %cst : f16 - %1 = arith.cmpf ogt, %0, %cst : f16 - linalg.yield %0, %1 : f16, i1 + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c64 step %c1 { + scf.for %arg4 = %c0 to %c56 step %c1 { + scf.for %arg5 = %c0 to %c56 step %c1 { + %subview = memref.subview %arg0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xf16> to memref> + %subview_1 = memref.subview %alloc_0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xi1> to memref> + %subview_2 = memref.subview %alloc[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xf16> to memref> + %subview_3 = memref.subview %arg1[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_3 : memref>, memref>) outs(%subview_2, %subview_1 : memref>, memref>) { + ^bb0(%in: f16, %in_4: f16, %out: f16, %out_5: i1): + %0 = arith.addf %in, %in_4 : f16 + %1 = arith.maximumf %0, %cst : f16 + %2 = arith.cmpf ogt, %1, %cst : f16 + linalg.yield %1, %2 : f16, i1 + } + } + } + } } return %alloc, %alloc_0 : memref<4x64x56x56xf16>, memref<4x64x56x56xi1> } - func.func private @Unknown32(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, 
memref<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x64x56x56xf16> - %alloc_0 = memref.alloc() : memref<4x64x56x56xi1> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) outs(%alloc, %alloc_0 : memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) { - ^bb0(%in: f16, %in_1: f16, %out: f16, %out_2: i1): - %0 = arith.addf %in, %in_1 : f16 - %1 = arith.maxnumf %0, %cst : f16 - %2 = arith.cmpf ogt, %1, %cst : f16 - linalg.yield %1, %2 : f16, i1 - } - return %alloc, %alloc_0 : memref<4x64x56x56xf16>, memref<4x64x56x56xi1> - } - func.func private @Unknown35(%arg0: memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x128x28x28xf16> - %alloc_0 = memref.alloc() : memref<4x128x28x28xi1> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<4x128x28x28xf16>) outs(%alloc, %alloc_0 : memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) { - ^bb0(%in: f16, %out: f16, %out_1: i1): - %0 = arith.maxnumf %in, %cst : f16 - %1 = arith.cmpf ogt, %0, %cst : f16 - linalg.yield %0, %1 : f16, i1 - } - return %alloc, %alloc_0 : memref<4x128x28x28xf16>, memref<4x128x28x28xi1> - } - func.func private @Unknown37(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown37(%arg0: memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %alloc = memref.alloc() : memref<4x128x28x28xf16> %alloc_0 = memref.alloc() : memref<4x128x28x28xi1> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>) outs(%alloc, %alloc_0 : memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) { - ^bb0(%in: f16, %in_1: f16, %out: f16, %out_2: i1): - %0 = arith.addf %in, %in_1 : f16 - %1 = arith.maxnumf %0, %cst : f16 - %2 = arith.cmpf ogt, %1, %cst : f16 - linalg.yield %1, %2 : f16, i1 + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c128 step %c1 { + scf.for %arg3 = %c0 to %c28 step %c1 { + scf.for %arg4 = %c0 to %c28 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xf16> to memref> + %subview_1 = memref.subview %alloc_0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xi1> to memref> + %subview_2 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_2, %subview_1 : memref>, memref>) { + ^bb0(%in: f16, %out: f16, %out_3: i1): + %0 = arith.maximumf %in, %cst : f16 + %1 = arith.cmpf ogt, %0, %cst : f16 + linalg.yield %0, %1 : f16, i1 + } + } + } + } } return %alloc, %alloc_0 : 
memref<4x128x28x28xf16>, memref<4x128x28x28xi1> } - func.func private @Unknown39(%arg0: memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown39(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %alloc = memref.alloc() : memref<4x128x28x28xf16> %alloc_0 = memref.alloc() : memref<4x128x28x28xi1> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<4x128x28x28xf16>) outs(%alloc, %alloc_0 : memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) { - ^bb0(%in: f16, %out: f16, %out_1: i1): - %0 = arith.maxnumf %in, %cst : f16 - %1 = arith.cmpf ogt, %0, %cst : f16 - linalg.yield %0, %1 : f16, i1 + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c128 step %c1 { + scf.for %arg4 = %c0 to %c28 step %c1 { + scf.for %arg5 = %c0 to %c28 step %c1 { + %subview = memref.subview %arg0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xf16> to memref> + %subview_1 = memref.subview %alloc_0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xi1> to memref> + %subview_2 = memref.subview %alloc[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xf16> to memref> + %subview_3 = memref.subview %arg1[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_3 : memref>, memref>) outs(%subview_2, %subview_1 : memref>, memref>) { + ^bb0(%in: f16, %in_4: f16, %out: f16, %out_5: i1): + %0 = arith.addf %in, %in_4 : f16 + %1 = arith.maximumf %0, %cst : f16 + %2 = arith.cmpf ogt, %1, %cst : f16 + linalg.yield %1, %2 : f16, i1 + } + } + } + } } return %alloc, %alloc_0 : memref<4x128x28x28xf16>, memref<4x128x28x28xi1> } - func.func private @Unknown41(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x128x28x28xf16> - %alloc_0 = memref.alloc() : memref<4x128x28x28xi1> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>) outs(%alloc, %alloc_0 : memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) { - ^bb0(%in: f16, %in_1: f16, %out: f16, %out_2: i1): - %0 = arith.addf %in, %in_1 : f16 - %1 = arith.maxnumf %0, %cst : f16 - %2 = arith.cmpf ogt, %1, %cst : f16 - linalg.yield %1, %2 : f16, i1 - } - return %alloc, %alloc_0 : memref<4x128x28x28xf16>, memref<4x128x28x28xi1> - } - func.func private @Unknown44(%arg0: memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown46(%arg0: memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index 
+ %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %alloc = memref.alloc() : memref<4x256x14x14xf16> %alloc_0 = memref.alloc() : memref<4x256x14x14xi1> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<4x256x14x14xf16>) outs(%alloc, %alloc_0 : memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) { - ^bb0(%in: f16, %out: f16, %out_1: i1): - %0 = arith.maxnumf %in, %cst : f16 - %1 = arith.cmpf ogt, %0, %cst : f16 - linalg.yield %0, %1 : f16, i1 + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + scf.for %arg3 = %c0 to %c14 step %c1 { + scf.for %arg4 = %c0 to %c14 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xf16> to memref> + %subview_1 = memref.subview %alloc_0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xi1> to memref> + %subview_2 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_2, %subview_1 : memref>, memref>) { + ^bb0(%in: f16, %out: f16, %out_3: i1): + %0 = arith.maximumf %in, %cst : f16 + %1 = arith.cmpf ogt, %0, %cst : f16 + linalg.yield %0, %1 : f16, i1 + } + } + } + } } return %alloc, %alloc_0 : memref<4x256x14x14xf16>, memref<4x256x14x14xi1> } - func.func private @Unknown46(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown48(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %alloc = memref.alloc() : memref<4x256x14x14xf16> %alloc_0 = memref.alloc() : memref<4x256x14x14xi1> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>) outs(%alloc, %alloc_0 : memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) { - ^bb0(%in: f16, %in_1: f16, %out: f16, %out_2: i1): - %0 = arith.addf %in, %in_1 : f16 - %1 = arith.maxnumf %0, %cst : f16 - %2 = arith.cmpf ogt, %1, %cst : f16 - linalg.yield %1, %2 : f16, i1 + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c256 step %c1 { + scf.for %arg4 = %c0 to %c14 step %c1 { + scf.for %arg5 = %c0 to %c14 step %c1 { + %subview = memref.subview %arg0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xf16> to memref> + %subview_1 = memref.subview %alloc_0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xi1> to memref> + %subview_2 = memref.subview %alloc[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xf16> to memref> + %subview_3 = memref.subview %arg1[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_3 : memref>, memref>) outs(%subview_2, %subview_1 : memref>, memref>) { + ^bb0(%in: f16, 
%in_4: f16, %out: f16, %out_5: i1): + %0 = arith.addf %in, %in_4 : f16 + %1 = arith.maximumf %0, %cst : f16 + %2 = arith.cmpf ogt, %1, %cst : f16 + linalg.yield %1, %2 : f16, i1 + } + } + } + } } return %alloc, %alloc_0 : memref<4x256x14x14xf16>, memref<4x256x14x14xi1> } - func.func private @Unknown48(%arg0: memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x256x14x14xf16> - %alloc_0 = memref.alloc() : memref<4x256x14x14xi1> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<4x256x14x14xf16>) outs(%alloc, %alloc_0 : memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) { - ^bb0(%in: f16, %out: f16, %out_1: i1): - %0 = arith.maxnumf %in, %cst : f16 - %1 = arith.cmpf ogt, %0, %cst : f16 - linalg.yield %0, %1 : f16, i1 - } - return %alloc, %alloc_0 : memref<4x256x14x14xf16>, memref<4x256x14x14xi1> - } - func.func private @Unknown50(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x256x14x14xf16> - %alloc_0 = memref.alloc() : memref<4x256x14x14xi1> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>) outs(%alloc, %alloc_0 : memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) { - ^bb0(%in: f16, %in_1: f16, %out: f16, %out_2: i1): - %0 = arith.addf %in, %in_1 : f16 - %1 = arith.maxnumf %0, %cst : f16 - %2 = arith.cmpf ogt, %1, %cst : f16 - linalg.yield %1, %2 : f16, i1 - } - return %alloc, %alloc_0 : memref<4x256x14x14xf16>, memref<4x256x14x14xi1> - } - func.func private @Unknown53(%arg0: memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown55(%arg0: memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %alloc = memref.alloc() : memref<4x512x7x7xf16> %alloc_0 = memref.alloc() : memref<4x512x7x7xi1> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<4x512x7x7xf16>) outs(%alloc, %alloc_0 : memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) { - ^bb0(%in: f16, %out: f16, %out_1: i1): - %0 = arith.maxnumf %in, %cst : f16 - %1 = arith.cmpf ogt, %0, %cst : f16 - linalg.yield %0, %1 : f16, i1 + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c512 step %c1 { + scf.for %arg3 = %c0 to %c7 step %c1 { + scf.for %arg4 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xf16> to memref> + %subview_1 = memref.subview %alloc_0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xi1> to memref> + %subview_2 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview 
: memref>) outs(%subview_2, %subview_1 : memref>, memref>) { + ^bb0(%in: f16, %out: f16, %out_3: i1): + %0 = arith.maximumf %in, %cst : f16 + %1 = arith.cmpf ogt, %0, %cst : f16 + linalg.yield %0, %1 : f16, i1 + } + } + } + } } return %alloc, %alloc_0 : memref<4x512x7x7xf16>, memref<4x512x7x7xi1> } - func.func private @Unknown55(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown57(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %alloc = memref.alloc() : memref<4x512x7x7xf16> %alloc_0 = memref.alloc() : memref<4x512x7x7xi1> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>) outs(%alloc, %alloc_0 : memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) { - ^bb0(%in: f16, %in_1: f16, %out: f16, %out_2: i1): - %0 = arith.addf %in, %in_1 : f16 - %1 = arith.maxnumf %0, %cst : f16 - %2 = arith.cmpf ogt, %1, %cst : f16 - linalg.yield %1, %2 : f16, i1 + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c512 step %c1 { + scf.for %arg4 = %c0 to %c7 step %c1 { + scf.for %arg5 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xf16> to memref> + %subview_1 = memref.subview %alloc_0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xi1> to memref> + %subview_2 = memref.subview %alloc[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xf16> to memref> + %subview_3 = memref.subview %arg1[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_3 : memref>, memref>) outs(%subview_2, %subview_1 : memref>, memref>) { + ^bb0(%in: f16, %in_4: f16, %out: f16, %out_5: i1): + %0 = arith.addf %in, %in_4 : f16 + %1 = arith.maximumf %0, %cst : f16 + %2 = arith.cmpf ogt, %1, %cst : f16 + linalg.yield %1, %2 : f16, i1 + } + } + } + } } return %alloc, %alloc_0 : memref<4x512x7x7xf16>, memref<4x512x7x7xi1> } - func.func private @Unknown57(%arg0: memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown62(%arg0: memref<4x512x7x7xf16>) -> memref<4x512xf16> attributes {__byteir_reduction_fusion__} { %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x512x7x7xf16> - %alloc_0 = memref.alloc() : memref<4x512x7x7xi1> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<4x512x7x7xf16>) outs(%alloc, %alloc_0 : memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) { - ^bb0(%in: f16, %out: f16, %out_1: i1): - %0 = arith.maxnumf %in, %cst : f16 - %1 = arith.cmpf ogt, %0, %cst : f16 - linalg.yield %0, %1 : f16, i1 - } - return %alloc, %alloc_0 : memref<4x512x7x7xf16>, memref<4x512x7x7xi1> - } - func.func private @Unknown59(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, 
memref<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x512x7x7xf16> - %alloc_0 = memref.alloc() : memref<4x512x7x7xi1> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>) outs(%alloc, %alloc_0 : memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) { - ^bb0(%in: f16, %in_1: f16, %out: f16, %out_2: i1): - %0 = arith.addf %in, %in_1 : f16 - %1 = arith.maxnumf %0, %cst : f16 - %2 = arith.cmpf ogt, %1, %cst : f16 - linalg.yield %1, %2 : f16, i1 - } - return %alloc, %alloc_0 : memref<4x512x7x7xf16>, memref<4x512x7x7xi1> - } - func.func private @Unknown60(%arg0: memref<4x512xf16>) -> memref<4x512xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %collapse_shape = memref.collapse_shape %arg0 [[0, 1], [2, 3]] : memref<4x512x7x7xf16> into memref<2048x49xf16> + %alloc = memref.alloc() : memref<2048xf16> + scf.forall (%arg1) in (2048) { + %subview = memref.subview %collapse_shape[%arg1, 0] [1, 49] [1, 1] : memref<2048x49xf16> to memref<49xf16, strided<[1], offset: ?>> + %expand_shape_0 = memref.expand_shape %subview [[0, 1]] : memref<49xf16, strided<[1], offset: ?>> into memref<1x49xf16, strided<[49, 1], offset: ?>> + %alloca = memref.alloca() : memref<64xf16, #gpu.address_space> + scf.forall (%arg2) in (64) { + %0 = affine.min #map6(%arg2) + %1 = affine.min #map7(%arg2) + %2 = affine.apply #map3(%1, %0) + %subview_6 = memref.subview %expand_shape_0[0, %0] [1, %2] [1, 1] : memref<1x49xf16, strided<[49, 1], offset: ?>> to memref> + %expand_shape_7 = memref.expand_shape %subview_6 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %3 = arith.cmpi ugt, %2, %c0 : index + %4 = scf.if %3 -> (f16) { + %6 = memref.load %expand_shape_7[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %6 : f16 + } else { + scf.yield %cst : f16 + } + %5 = arith.addf %4, %cst : f16 + memref.store %5, %alloca[%arg2] : memref<64xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_1 = memref.alloca() : memref<32xf16, #gpu.address_space> + scf.forall (%arg2) in (32) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca[%0] : memref<64xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca[%3] : memref<64xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_1[%arg2] : memref<32xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_2 = memref.alloca() : memref<16xf16, #gpu.address_space> + scf.forall (%arg2) in (16) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_1[%0] : memref<32xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_1[%3] : memref<32xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_2[%arg2] : memref<16xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_3 = memref.alloca() : memref<8xf16, #gpu.address_space> + scf.forall (%arg2) in (8) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_2[%0] : memref<16xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_2[%3] : memref<16xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_3[%arg2] : memref<8xf16, #gpu.address_space> + } 
{mapping = [#gpu.thread]} + %alloca_4 = memref.alloca() : memref<4xf16, #gpu.address_space> + scf.forall (%arg2) in (4) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_3[%0] : memref<8xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_3[%3] : memref<8xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_4[%arg2] : memref<4xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_5 = memref.alloca() : memref<2xf16, #gpu.address_space> + scf.forall (%arg2) in (2) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_4[%0] : memref<4xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_4[%3] : memref<4xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_5[%arg2] : memref<2xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + scf.forall (%arg2) in (1) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_5[%0] : memref<2xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_5[%3] : memref<2xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloc[%arg1] : memref<2048xf16> + } {mapping = [#gpu.thread]} + } {mapping = [#gpu.block]} + %expand_shape = memref.expand_shape %alloc [[0, 1]] : memref<2048xf16> into memref<4x512xf16> + return %expand_shape : memref<4x512xf16> + } + func.func private @Unknown63(%arg0: memref<4x512xf16>) -> memref<4x512xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 2.040100e-02 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index %alloc = memref.alloc() : memref<4x512xf16> - linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg0 : memref<4x512xf16>) outs(%alloc : memref<4x512xf16>) { - ^bb0(%in: f16, %out: f16): - %0 = arith.mulf %in, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c512 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2] [1, 1] [1, 1] : memref<4x512xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2] [1, 1] [1, 1] : memref<4x512xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f16): + %0 = arith.mulf %in, %cst : f16 + linalg.yield %0 : f16 + } + } } return %alloc : memref<4x512xf16> } - func.func private @Unknown61(%arg0: memref<1000xf32>, %arg1: memref<4x1000xf16>) -> memref<4x1000xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown64(%arg0: memref<1000xf16>, %arg1: memref<4x1000xf16>) -> memref<4x1000xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index %alloc = memref.alloc() : memref<4x1000xf16> - linalg.generic {indexing_maps = [#map1, #map2, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg1, %arg0 : memref<4x1000xf16>, memref<1000xf32>) outs(%alloc : memref<4x1000xf16>) { - ^bb0(%in: f16, %in_0: f32, %out: f16): - %0 = arith.truncf %in_0 : f32 to f16 - %1 = arith.addf %in, %0 : f16 - linalg.yield %1 : f16 + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c1000 step %c1 { + %subview = memref.subview 
%arg0[%arg3] [1] [1] : memref<1000xf16> to memref> + %subview_0 = memref.subview %alloc[%arg2, %arg3] [1, 1] [1, 1] : memref<4x1000xf16> to memref> + %subview_1 = memref.subview %arg1[%arg2, %arg3] [1, 1] [1, 1] : memref<4x1000xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.addf %in_2, %in : f16 + linalg.yield %0 : f16 + } + } } return %alloc : memref<4x1000xf16> } - func.func private @Unknown62(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>) -> (memref<4x1000xf16>, memref<4x1000xf16>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown65(%arg0: memref<4x1000xf16>) -> memref<4xf16> attributes {__byteir_reduction_fusion__} { + %cst = arith.constant 0.000000e+00 : f16 + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %alloc = memref.alloc() : memref<4xf16> + scf.forall (%arg1) in (4) { + %subview = memref.subview %arg0[%arg1, 0] [1, 1000] [1, 1] : memref<4x1000xf16> to memref<1000xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<1000xf16, strided<[1], offset: ?>> into memref<1x1000xf16, strided<[1000, 1], offset: ?>> + %alloca = memref.alloca() : memref<512xf16, #gpu.address_space> + scf.forall (%arg2) in (512) { + %0 = affine.min #map1(%arg2) + %1 = affine.min #map2(%arg2) + %2 = affine.apply #map3(%1, %0) + %subview_8 = memref.subview %expand_shape[0, %0] [1, %2] [1, 1] : memref<1x1000xf16, strided<[1000, 1], offset: ?>> to memref> + %expand_shape_9 = memref.expand_shape %subview_8 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %3 = arith.cmpi ugt, %2, %c0 : index + %4 = scf.if %3 -> (f16) { + %8 = memref.load %expand_shape_9[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %8 : f16 + } else { + scf.yield %cst : f16 + } + %5 = arith.cmpi ugt, %2, %c1 : index + %6 = scf.if %5 -> (f16) { + %8 = memref.load %expand_shape_9[%c0, %c1] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %8 : f16 + } else { + scf.yield %cst : f16 + } + %7 = arith.maximumf %4, %6 : f16 + memref.store %7, %alloca[%arg2] : memref<512xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_0 = memref.alloca() : memref<256xf16, #gpu.address_space> + scf.forall (%arg2) in (256) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca[%0] : memref<512xf16, #gpu.address_space> + %2 = affine.apply #map5(%arg2) + %3 = memref.load %alloca[%2] : memref<512xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloca_0[%arg2] : memref<256xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_1 = memref.alloca() : memref<128xf16, #gpu.address_space> + scf.forall (%arg2) in (128) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_0[%0] : memref<256xf16, #gpu.address_space> + %2 = affine.apply #map5(%arg2) + %3 = memref.load %alloca_0[%2] : memref<256xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloca_1[%arg2] : memref<128xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_2 = memref.alloca() : memref<64xf16, #gpu.address_space> + scf.forall (%arg2) in (64) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_1[%0] : memref<128xf16, #gpu.address_space> + %2 = affine.apply #map5(%arg2) + %3 = memref.load %alloca_1[%2] : memref<128xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, 
%alloca_2[%arg2] : memref<64xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_3 = memref.alloca() : memref<32xf16, #gpu.address_space> + scf.forall (%arg2) in (32) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_2[%0] : memref<64xf16, #gpu.address_space> + %2 = affine.apply #map5(%arg2) + %3 = memref.load %alloca_2[%2] : memref<64xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloca_3[%arg2] : memref<32xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_4 = memref.alloca() : memref<16xf16, #gpu.address_space> + scf.forall (%arg2) in (16) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_3[%0] : memref<32xf16, #gpu.address_space> + %2 = affine.apply #map5(%arg2) + %3 = memref.load %alloca_3[%2] : memref<32xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloca_4[%arg2] : memref<16xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_5 = memref.alloca() : memref<8xf16, #gpu.address_space> + scf.forall (%arg2) in (8) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_4[%0] : memref<16xf16, #gpu.address_space> + %2 = affine.apply #map5(%arg2) + %3 = memref.load %alloca_4[%2] : memref<16xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloca_5[%arg2] : memref<8xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_6 = memref.alloca() : memref<4xf16, #gpu.address_space> + scf.forall (%arg2) in (4) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_5[%0] : memref<8xf16, #gpu.address_space> + %2 = affine.apply #map5(%arg2) + %3 = memref.load %alloca_5[%2] : memref<8xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloca_6[%arg2] : memref<4xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_7 = memref.alloca() : memref<2xf16, #gpu.address_space> + scf.forall (%arg2) in (2) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_6[%0] : memref<4xf16, #gpu.address_space> + %2 = affine.apply #map5(%arg2) + %3 = memref.load %alloca_6[%2] : memref<4xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloca_7[%arg2] : memref<2xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + scf.forall (%arg2) in (1) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_7[%0] : memref<2xf16, #gpu.address_space> + %2 = affine.apply #map5(%arg2) + %3 = memref.load %alloca_7[%2] : memref<2xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloc[%arg1] : memref<4xf16> + } {mapping = [#gpu.thread]} + } {mapping = [#gpu.block]} + return %alloc : memref<4xf16> + } + func.func private @Unknown66(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>) -> memref<4x1000xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index %alloc = memref.alloc() : memref<4x1000xf16> - %alloc_0 = memref.alloc() : memref<4x1000xf16> - linalg.generic {indexing_maps = [#map1, #map3, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg1, %arg0 : memref<4x1000xf16>, memref<4xf16>) outs(%alloc, %alloc_0 : memref<4x1000xf16>, memref<4x1000xf16>) { - ^bb0(%in: f16, %in_1: f16, %out: f16, %out_2: f16): - %0 = arith.subf %in, %in_1 : f16 - %1 = math.exp %0 : f16 - linalg.yield %0, %1 : f16, f16 + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c1000 step %c1 { + %subview = 
memref.subview %arg0[%arg2] [1] [1] : memref<4xf16> to memref> + %subview_0 = memref.subview %alloc[%arg2, %arg3] [1, 1] [1, 1] : memref<4x1000xf16> to memref> + %subview_1 = memref.subview %arg1[%arg2, %arg3] [1, 1] [1, 1] : memref<4x1000xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.subf %in_2, %in : f16 + linalg.yield %0 : f16 + } + } } - return %alloc, %alloc_0 : memref<4x1000xf16>, memref<4x1000xf16> + return %alloc : memref<4x1000xf16> } - func.func private @Unknown63(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4xf16>, %arg3: memref<4x1000xf16>, %arg4: memref<4x1000xf32>) -> (memref<4x1000xf16>, memref<4x1000xf32>, memref<4x1000xf32>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown67(%arg0: memref<4x1000xf16>) -> memref<4xf16> attributes {__byteir_reduction_fusion__} { + %cst = arith.constant 0.000000e+00 : f16 + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %alloc = memref.alloc() : memref<4xf16> + scf.forall (%arg1) in (4) { + %subview = memref.subview %arg0[%arg1, 0] [1, 1000] [1, 1] : memref<4x1000xf16> to memref<1000xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<1000xf16, strided<[1], offset: ?>> into memref<1x1000xf16, strided<[1000, 1], offset: ?>> + %alloca = memref.alloca() : memref<512xf16, #gpu.address_space> + scf.forall (%arg2) in (512) { + %0 = affine.min #map1(%arg2) + %1 = affine.min #map2(%arg2) + %2 = affine.apply #map3(%1, %0) + %subview_8 = memref.subview %expand_shape[0, %0] [1, %2] [1, 1] : memref<1x1000xf16, strided<[1000, 1], offset: ?>> to memref> + %expand_shape_9 = memref.expand_shape %subview_8 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %3 = arith.cmpi ugt, %2, %c0 : index + %4 = scf.if %3 -> (f16) { + %11 = memref.load %expand_shape_9[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %11 : f16 + } else { + scf.yield %cst : f16 + } + %5 = math.exp %4 : f16 + %6 = arith.addf %5, %cst : f16 + %7 = arith.cmpi ugt, %2, %c1 : index + %8 = scf.if %7 -> (f16) { + %11 = memref.load %expand_shape_9[%c0, %c1] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %11 : f16 + } else { + scf.yield %cst : f16 + } + %9 = math.exp %8 : f16 + %10 = arith.addf %6, %9 : f16 + memref.store %10, %alloca[%arg2] : memref<512xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_0 = memref.alloca() : memref<256xf16, #gpu.address_space> + scf.forall (%arg2) in (256) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca[%0] : memref<512xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca[%3] : memref<512xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_0[%arg2] : memref<256xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_1 = memref.alloca() : memref<128xf16, #gpu.address_space> + scf.forall (%arg2) in (128) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_0[%0] : memref<256xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_0[%3] : memref<256xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_1[%arg2] : memref<128xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_2 = memref.alloca() : 
memref<64xf16, #gpu.address_space> + scf.forall (%arg2) in (64) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_1[%0] : memref<128xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_1[%3] : memref<128xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_2[%arg2] : memref<64xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_3 = memref.alloca() : memref<32xf16, #gpu.address_space> + scf.forall (%arg2) in (32) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_2[%0] : memref<64xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_2[%3] : memref<64xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_3[%arg2] : memref<32xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_4 = memref.alloca() : memref<16xf16, #gpu.address_space> + scf.forall (%arg2) in (16) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_3[%0] : memref<32xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_3[%3] : memref<32xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_4[%arg2] : memref<16xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_5 = memref.alloca() : memref<8xf16, #gpu.address_space> + scf.forall (%arg2) in (8) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_4[%0] : memref<16xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_4[%3] : memref<16xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_5[%arg2] : memref<8xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_6 = memref.alloca() : memref<4xf16, #gpu.address_space> + scf.forall (%arg2) in (4) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_5[%0] : memref<8xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_5[%3] : memref<8xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_6[%arg2] : memref<4xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_7 = memref.alloca() : memref<2xf16, #gpu.address_space> + scf.forall (%arg2) in (2) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_6[%0] : memref<4xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_6[%3] : memref<4xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_7[%arg2] : memref<2xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + scf.forall (%arg2) in (1) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_7[%0] : memref<2xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_7[%3] : memref<2xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloc[%arg1] : memref<4xf16> + } {mapping = [#gpu.thread]} + } {mapping = [#gpu.block]} + return %alloc : memref<4xf16> + } + func.func private @Unknown68(%arg0: memref<4xf16>) -> memref<4xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %alloc = memref.alloc() : memref<4xf16> + scf.for %arg1 = %c0 to %c4 step %c1 { + %subview = memref.subview 
%arg0[%arg1] [1] [1] : memref<4xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1] [1] [1] : memref<4xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f16): + %0 = math.log %in : f16 + linalg.yield %0 : f16 + } + } + return %alloc : memref<4xf16> + } + func.func private @Unknown69(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4xf16>, %arg3: memref<4x1000xf16>) -> (memref<4x1000xf16>, memref<4x1000xf16>) attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index %alloc = memref.alloc() : memref<4x1000xf16> - %alloc_0 = memref.alloc() : memref<4x1000xf32> - %alloc_1 = memref.alloc() : memref<4x1000xf32> - linalg.generic {indexing_maps = [#map1, #map1, #map3, #map3, #map1, #map1, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg3, %arg1, %arg0, %arg2, %arg4 : memref<4x1000xf16>, memref<4x1000xf16>, memref<4xf16>, memref<4xf16>, memref<4x1000xf32>) outs(%alloc, %alloc_0, %alloc_1 : memref<4x1000xf16>, memref<4x1000xf32>, memref<4x1000xf32>) { - ^bb0(%in: f16, %in_2: f16, %in_3: f16, %in_4: f16, %in_5: f32, %out: f16, %out_6: f32, %out_7: f32): - %0 = math.log %in_3 : f16 - %1 = arith.subf %in_2, %0 : f16 - %2 = math.exp %1 : f16 - %3 = arith.mulf %2, %in_4 : f16 - %4 = arith.subf %in, %3 : f16 - %5 = arith.extf %1 : f16 to f32 - %6 = arith.mulf %5, %in_5 : f32 - %7 = arith.extf %4 : f16 to f32 - linalg.yield %4, %6, %7 : f16, f32, f32 + %alloc_0 = memref.alloc() : memref<4x1000xf16> + scf.for %arg4 = %c0 to %c4 step %c1 { + scf.for %arg5 = %c0 to %c1000 step %c1 { + %subview = memref.subview %arg2[%arg4] [1] [1] : memref<4xf16> to memref> + %subview_1 = memref.subview %alloc_0[%arg4, %arg5] [1, 1] [1, 1] : memref<4x1000xf16> to memref> + %subview_2 = memref.subview %alloc[%arg4, %arg5] [1, 1] [1, 1] : memref<4x1000xf16> to memref> + %subview_3 = memref.subview %arg0[%arg4] [1] [1] : memref<4xf16> to memref> + %subview_4 = memref.subview %arg1[%arg4, %arg5] [1, 1] [1, 1] : memref<4x1000xf16> to memref> + %subview_5 = memref.subview %arg3[%arg4, %arg5] [1, 1] [1, 1] : memref<4x1000xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_3, %subview_4, %subview_5 : memref>, memref>, memref>, memref>) outs(%subview_2, %subview_1 : memref>, memref>) { + ^bb0(%in: f16, %in_6: f16, %in_7: f16, %in_8: f16, %out: f16, %out_9: f16): + %0 = arith.subf %in_7, %in_6 : f16 + %1 = math.exp %0 : f16 + %2 = arith.mulf %1, %in : f16 + %3 = arith.subf %in_8, %2 : f16 + linalg.yield %0, %3 : f16, f16 + } + } } - return %alloc, %alloc_0, %alloc_1 : memref<4x1000xf16>, memref<4x1000xf32>, memref<4x1000xf32> + return %alloc, %alloc_0 : memref<4x1000xf16>, memref<4x1000xf16> } - func.func private @Unknown64(%arg0: memref<4x512xf16>, %arg1: memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown70(%arg0: memref<4x512xf16>, %arg1: memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 4.900000e+01 : f16 %cst_0 = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %alloc = memref.alloc() : 
memref<4x512x7x7xf16> - linalg.generic {indexing_maps = [#map, #map4, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg1, %arg0 : memref<4x512x7x7xi1>, memref<4x512xf16>) outs(%alloc : memref<4x512x7x7xf16>) { - ^bb0(%in: i1, %in_1: f16, %out: f16): - %0 = arith.divf %in_1, %cst : f16 - %1 = arith.select %in, %0, %cst_0 : f16 - linalg.yield %1 : f16 + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c512 step %c1 { + scf.for %arg4 = %c0 to %c7 step %c1 { + scf.for %arg5 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[%arg2, %arg3] [1, 1] [1, 1] : memref<4x512xf16> to memref> + %subview_1 = memref.subview %alloc[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xf16> to memref> + %subview_2 = memref.subview %arg1[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xi1> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_2 : memref>, memref>) outs(%subview_1 : memref>) { + ^bb0(%in: f16, %in_3: i1, %out: f16): + %0 = arith.divf %in, %cst : f16 + %1 = arith.select %in_3, %0, %cst_0 : f16 + linalg.yield %1 : f16 + } + } + } + } } return %alloc : memref<4x512x7x7xf16> } - func.func private @Unknown68(%arg0: memref<4x512x7x7xi1>, %arg1: memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown74(%arg0: memref<4x512x7x7xi1>, %arg1: memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %alloc = memref.alloc() : memref<4x512x7x7xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x512x7x7xi1>, memref<4x512x7x7xf16>) outs(%alloc : memref<4x512x7x7xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %0 = arith.select %in, %in_0, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c512 step %c1 { + scf.for %arg4 = %c0 to %c7 step %c1 { + scf.for %arg5 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xi1> to memref> + %subview_0 = memref.subview %alloc[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xf16> to memref> + %subview_1 = memref.subview %arg1[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: i1, %in_2: f16, %out: f16): + %0 = arith.select %in, %in_2, %cst : f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<4x512x7x7xf16> } - func.func private @Unknown72(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown78(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index 
+ %c7 = arith.constant 7 : index %alloc = memref.alloc() : memref<4x512x7x7xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<4x512x7x7xi1>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>) outs(%alloc : memref<4x512x7x7xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.select %in, %0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg3 = %c0 to %c4 step %c1 { + scf.for %arg4 = %c0 to %c512 step %c1 { + scf.for %arg5 = %c0 to %c7 step %c1 { + scf.for %arg6 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xf16> to memref> + %subview_0 = memref.subview %alloc[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xf16> to memref> + %subview_1 = memref.subview %arg1[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xf16> to memref> + %subview_2 = memref.subview %arg2[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xi1> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_1, %subview_2 : memref>, memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_3: f16, %in_4: i1, %out: f16): + %0 = arith.addf %in, %in_3 : f16 + %1 = arith.select %in_4, %0, %cst : f16 + linalg.yield %1 : f16 + } + } + } + } } return %alloc : memref<4x512x7x7xf16> } - func.func private @Unknown76(%arg0: memref<4x512x7x7xi1>, %arg1: memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x512x7x7xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x512x7x7xi1>, memref<4x512x7x7xf16>) outs(%alloc : memref<4x512x7x7xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %0 = arith.select %in, %in_0, %cst : f16 - linalg.yield %0 : f16 - } - return %alloc : memref<4x512x7x7xf16> - } - func.func private @Unknown83(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x256x14x14xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<4x256x14x14xi1>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>) outs(%alloc : memref<4x256x14x14xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.select %in, %0, %cst : f16 - linalg.yield %1 : f16 - } - return %alloc : memref<4x256x14x14xf16> - } - func.func private @Unknown87(%arg0: memref<4x256x14x14xi1>, %arg1: memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown89(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %alloc = memref.alloc() : 
memref<4x256x14x14xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x256x14x14xi1>, memref<4x256x14x14xf16>) outs(%alloc : memref<4x256x14x14xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %0 = arith.select %in, %in_0, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg3 = %c0 to %c4 step %c1 { + scf.for %arg4 = %c0 to %c256 step %c1 { + scf.for %arg5 = %c0 to %c14 step %c1 { + scf.for %arg6 = %c0 to %c14 step %c1 { + %subview = memref.subview %arg0[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xf16> to memref> + %subview_0 = memref.subview %alloc[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xf16> to memref> + %subview_1 = memref.subview %arg1[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xf16> to memref> + %subview_2 = memref.subview %arg2[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xi1> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_1, %subview_2 : memref>, memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_3: f16, %in_4: i1, %out: f16): + %0 = arith.addf %in, %in_3 : f16 + %1 = arith.select %in_4, %0, %cst : f16 + linalg.yield %1 : f16 + } + } + } + } } return %alloc : memref<4x256x14x14xf16> } - func.func private @Unknown91(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown93(%arg0: memref<4x256x14x14xi1>, %arg1: memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %alloc = memref.alloc() : memref<4x256x14x14xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<4x256x14x14xi1>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>) outs(%alloc : memref<4x256x14x14xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.select %in, %0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c256 step %c1 { + scf.for %arg4 = %c0 to %c14 step %c1 { + scf.for %arg5 = %c0 to %c14 step %c1 { + %subview = memref.subview %arg0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xi1> to memref> + %subview_0 = memref.subview %alloc[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xf16> to memref> + %subview_1 = memref.subview %arg1[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: i1, %in_2: f16, %out: f16): + %0 = arith.select %in, %in_2, %cst : f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<4x256x14x14xf16> } - func.func private @Unknown95(%arg0: memref<4x256x14x14xi1>, %arg1: memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = 
memref.alloc() : memref<4x256x14x14xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x256x14x14xi1>, memref<4x256x14x14xf16>) outs(%alloc : memref<4x256x14x14xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %0 = arith.select %in, %in_0, %cst : f16 - linalg.yield %0 : f16 - } - return %alloc : memref<4x256x14x14xf16> - } - func.func private @Unknown102(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown108(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %alloc = memref.alloc() : memref<4x128x28x28xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<4x128x28x28xi1>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>) outs(%alloc : memref<4x128x28x28xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.select %in, %0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg3 = %c0 to %c4 step %c1 { + scf.for %arg4 = %c0 to %c128 step %c1 { + scf.for %arg5 = %c0 to %c28 step %c1 { + scf.for %arg6 = %c0 to %c28 step %c1 { + %subview = memref.subview %arg0[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xf16> to memref> + %subview_0 = memref.subview %alloc[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xf16> to memref> + %subview_1 = memref.subview %arg1[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xf16> to memref> + %subview_2 = memref.subview %arg2[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xi1> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_1, %subview_2 : memref>, memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_3: f16, %in_4: i1, %out: f16): + %0 = arith.addf %in, %in_3 : f16 + %1 = arith.select %in_4, %0, %cst : f16 + linalg.yield %1 : f16 + } + } + } + } } return %alloc : memref<4x128x28x28xf16> } - func.func private @Unknown106(%arg0: memref<4x128x28x28xi1>, %arg1: memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown112(%arg0: memref<4x128x28x28xi1>, %arg1: memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %alloc = memref.alloc() : memref<4x128x28x28xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x128x28x28xi1>, memref<4x128x28x28xf16>) outs(%alloc : memref<4x128x28x28xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %0 = arith.select %in, %in_0, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for 
%arg3 = %c0 to %c128 step %c1 { + scf.for %arg4 = %c0 to %c28 step %c1 { + scf.for %arg5 = %c0 to %c28 step %c1 { + %subview = memref.subview %arg0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xi1> to memref> + %subview_0 = memref.subview %alloc[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xf16> to memref> + %subview_1 = memref.subview %arg1[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: i1, %in_2: f16, %out: f16): + %0 = arith.select %in, %in_2, %cst : f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<4x128x28x28xf16> } - func.func private @Unknown110(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x128x28x28xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<4x128x28x28xi1>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>) outs(%alloc : memref<4x128x28x28xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.select %in, %0, %cst : f16 - linalg.yield %1 : f16 - } - return %alloc : memref<4x128x28x28xf16> - } - func.func private @Unknown114(%arg0: memref<4x128x28x28xi1>, %arg1: memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x128x28x28xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x128x28x28xi1>, memref<4x128x28x28xf16>) outs(%alloc : memref<4x128x28x28xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %0 = arith.select %in, %in_0, %cst : f16 - linalg.yield %0 : f16 - } - return %alloc : memref<4x128x28x28xf16> - } - func.func private @Unknown121(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown127(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %alloc = memref.alloc() : memref<4x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<4x64x56x56xi1>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) outs(%alloc : memref<4x64x56x56xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.select %in, %0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg3 = %c0 to %c4 step %c1 { + scf.for %arg4 = %c0 to %c64 step %c1 { + scf.for %arg5 = %c0 to %c56 step %c1 { + scf.for %arg6 = %c0 to %c56 step %c1 { + %subview = memref.subview %arg0[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : 
memref<4x64x56x56xf16> to memref> + %subview_0 = memref.subview %alloc[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xf16> to memref> + %subview_1 = memref.subview %arg1[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xf16> to memref> + %subview_2 = memref.subview %arg2[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xi1> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_1, %subview_2 : memref>, memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_3: f16, %in_4: i1, %out: f16): + %0 = arith.addf %in, %in_3 : f16 + %1 = arith.select %in_4, %0, %cst : f16 + linalg.yield %1 : f16 + } + } + } + } } return %alloc : memref<4x64x56x56xf16> } - func.func private @Unknown125(%arg0: memref<4x64x56x56xi1>, %arg1: memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown131(%arg0: memref<4x64x56x56xi1>, %arg1: memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %alloc = memref.alloc() : memref<4x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x64x56x56xi1>, memref<4x64x56x56xf16>) outs(%alloc : memref<4x64x56x56xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %0 = arith.select %in, %in_0, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c64 step %c1 { + scf.for %arg4 = %c0 to %c56 step %c1 { + scf.for %arg5 = %c0 to %c56 step %c1 { + %subview = memref.subview %arg0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xi1> to memref> + %subview_0 = memref.subview %alloc[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xf16> to memref> + %subview_1 = memref.subview %arg1[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: i1, %in_2: f16, %out: f16): + %0 = arith.select %in, %in_2, %cst : f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<4x64x56x56xf16> } - func.func private @Unknown129(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 + func.func private @Unknown143(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %alloc = memref.alloc() : memref<4x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<4x64x56x56xi1>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) outs(%alloc : memref<4x64x56x56xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 
= arith.select %in, %0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c64 step %c1 { + scf.for %arg4 = %c0 to %c56 step %c1 { + scf.for %arg5 = %c0 to %c56 step %c1 { + %subview = memref.subview %arg0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xf16> to memref> + %subview_0 = memref.subview %alloc[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xf16> to memref> + %subview_1 = memref.subview %arg1[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.addf %in, %in_2 : f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<4x64x56x56xf16> } - func.func private @Unknown133(%arg0: memref<4x64x56x56xi1>, %arg1: memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x64x56x56xi1>, memref<4x64x56x56xf16>) outs(%alloc : memref<4x64x56x56xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %0 = arith.select %in, %in_0, %cst : f16 - linalg.yield %0 : f16 - } - return %alloc : memref<4x64x56x56xf16> - } - func.func private @Unknown137(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<4x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) outs(%alloc : memref<4x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.addf %in, %in_0 : f16 - linalg.yield %0 : f16 - } - return %alloc : memref<4x64x56x56xf16> - } - func.func private @Unknown138(%arg0: memref<4x64x112x112xi1>, %arg1: memref<4x64x112x112xf16>) -> memref<4x64x112x112xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown144(%arg0: memref<4x64x112x112xi1>, %arg1: memref<4x64x112x112xf16>) -> memref<4x64x112x112xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c112 = arith.constant 112 : index %alloc = memref.alloc() : memref<4x64x112x112xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x64x112x112xi1>, memref<4x64x112x112xf16>) outs(%alloc : memref<4x64x112x112xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %0 = arith.select %in, %in_0, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c64 step %c1 { + scf.for %arg4 = %c0 to %c112 step %c1 { + scf.for %arg5 = %c0 to %c112 step %c1 { + %subview = memref.subview %arg0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x112x112xi1> to memref> + %subview_0 = memref.subview %alloc[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x112x112xf16> to memref> + %subview_1 = memref.subview %arg1[%arg2, %arg3, 
%arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x112x112xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: i1, %in_2: f16, %out: f16): + %0 = arith.select %in, %in_2, %cst : f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<4x64x112x112xf16> } - func.func private @Unknown141(%arg0: memref) -> memref attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown147(%arg0: memref<4x1000xf16>, %arg1: memref<4x1000xf32>) -> memref attributes {__byteir_reduction_fusion__} { + %cst = arith.constant 0.000000e+00 : f32 + %cst_0 = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %alloc = memref.alloc() : memref + %collapse_shape = memref.collapse_shape %arg0 [[0, 1]] : memref<4x1000xf16> into memref<4000xf16> + %collapse_shape_1 = memref.collapse_shape %arg1 [[0, 1]] : memref<4x1000xf32> into memref<4000xf32> + %expand_shape = memref.expand_shape %collapse_shape [[0, 1]] : memref<4000xf16> into memref<32x125xf16> + %expand_shape_2 = memref.expand_shape %collapse_shape_1 [[0, 1]] : memref<4000xf32> into memref<32x125xf32> + %alloc_3 = memref.alloc() : memref<32xf32> + scf.forall (%arg2) in (32) { + %subview = memref.subview %expand_shape[%arg2, 0] [1, 125] [1, 1] : memref<32x125xf16> to memref<125xf16, strided<[1], offset: ?>> + %expand_shape_4 = memref.expand_shape %subview [[0, 1]] : memref<125xf16, strided<[1], offset: ?>> into memref<1x125xf16, strided<[125, 1], offset: ?>> + %subview_5 = memref.subview %expand_shape_2[%arg2, 0] [1, 125] [1, 1] : memref<32x125xf32> to memref<125xf32, strided<[1], offset: ?>> + %expand_shape_6 = memref.expand_shape %subview_5 [[0, 1]] : memref<125xf32, strided<[1], offset: ?>> into memref<1x125xf32, strided<[125, 1], offset: ?>> + %alloca = memref.alloca() : memref<128xf32, #gpu.address_space> + scf.forall (%arg3) in (128) { + %0 = affine.min #map8(%arg3) + %1 = affine.min #map9(%arg3) + %2 = affine.apply #map3(%1, %0) + %subview_13 = memref.subview %expand_shape_4[0, %0] [1, %2] [1, 1] : memref<1x125xf16, strided<[125, 1], offset: ?>> to memref> + %expand_shape_14 = memref.expand_shape %subview_13 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %subview_15 = memref.subview %expand_shape_6[0, %0] [1, %2] [1, 1] : memref<1x125xf32, strided<[125, 1], offset: ?>> to memref> + %expand_shape_16 = memref.expand_shape %subview_15 [[0, 1]] : memref> into memref<1x?xf32, strided<[?, 1], offset: ?>> + %3 = arith.cmpi ugt, %2, %c0 : index + %4:2 = scf.if %3 -> (f16, f32) { + %8 = memref.load %expand_shape_14[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + %9 = memref.load %expand_shape_16[%c0, %c0] : memref<1x?xf32, strided<[?, 1], offset: ?>> + scf.yield %8, %9 : f16, f32 + } else { + scf.yield %cst_0, %cst : f16, f32 + } + %5 = arith.extf %4#0 : f16 to f32 + %6 = arith.mulf %5, %4#1 : f32 + %7 = arith.addf %6, %cst : f32 + memref.store %7, %alloca[%arg3] : memref<128xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_7 = memref.alloca() : memref<64xf32, #gpu.address_space> + scf.forall (%arg3) in (64) { + %0 = affine.apply #map4(%arg3) + %1 = memref.load %alloca[%0] : memref<128xf32, #gpu.address_space> + %2 = arith.addf %1, %cst : f32 + %3 = affine.apply #map5(%arg3) + %4 = memref.load %alloca[%3] : memref<128xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_7[%arg3] : memref<64xf32, #gpu.address_space> + } 
{mapping = [#gpu.thread]} + %alloca_8 = memref.alloca() : memref<32xf32, #gpu.address_space> + scf.forall (%arg3) in (32) { + %0 = affine.apply #map4(%arg3) + %1 = memref.load %alloca_7[%0] : memref<64xf32, #gpu.address_space> + %2 = arith.addf %1, %cst : f32 + %3 = affine.apply #map5(%arg3) + %4 = memref.load %alloca_7[%3] : memref<64xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_8[%arg3] : memref<32xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_9 = memref.alloca() : memref<16xf32, #gpu.address_space> + scf.forall (%arg3) in (16) { + %0 = affine.apply #map4(%arg3) + %1 = memref.load %alloca_8[%0] : memref<32xf32, #gpu.address_space> + %2 = arith.addf %1, %cst : f32 + %3 = affine.apply #map5(%arg3) + %4 = memref.load %alloca_8[%3] : memref<32xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_9[%arg3] : memref<16xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_10 = memref.alloca() : memref<8xf32, #gpu.address_space> + scf.forall (%arg3) in (8) { + %0 = affine.apply #map4(%arg3) + %1 = memref.load %alloca_9[%0] : memref<16xf32, #gpu.address_space> + %2 = arith.addf %1, %cst : f32 + %3 = affine.apply #map5(%arg3) + %4 = memref.load %alloca_9[%3] : memref<16xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_10[%arg3] : memref<8xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_11 = memref.alloca() : memref<4xf32, #gpu.address_space> + scf.forall (%arg3) in (4) { + %0 = affine.apply #map4(%arg3) + %1 = memref.load %alloca_10[%0] : memref<8xf32, #gpu.address_space> + %2 = arith.addf %1, %cst : f32 + %3 = affine.apply #map5(%arg3) + %4 = memref.load %alloca_10[%3] : memref<8xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_11[%arg3] : memref<4xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_12 = memref.alloca() : memref<2xf32, #gpu.address_space> + scf.forall (%arg3) in (2) { + %0 = affine.apply #map4(%arg3) + %1 = memref.load %alloca_11[%0] : memref<4xf32, #gpu.address_space> + %2 = arith.addf %1, %cst : f32 + %3 = affine.apply #map5(%arg3) + %4 = memref.load %alloca_11[%3] : memref<4xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_12[%arg3] : memref<2xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + scf.forall (%arg3) in (1) { + %0 = affine.apply #map4(%arg3) + %1 = memref.load %alloca_12[%0] : memref<2xf32, #gpu.address_space> + %2 = arith.addf %1, %cst : f32 + %3 = affine.apply #map5(%arg3) + %4 = memref.load %alloca_12[%3] : memref<2xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloc_3[%arg2] : memref<32xf32> + } {mapping = [#gpu.thread]} + } {mapping = [#gpu.block]} + scf.forall (%arg2) in (1) { + %alloca = memref.alloca() : memref<32xf32, #gpu.address_space> + scf.forall (%arg3) in (32) { + %0 = affine.apply #map10(%arg2)[%arg3] + %1 = memref.load %alloc_3[%0] : memref<32xf32> + %2 = arith.addf %1, %cst : f32 + memref.store %2, %alloca[%arg3] : memref<32xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_4 = memref.alloca() : memref<16xf32, #gpu.address_space> + scf.forall (%arg3) in (16) { + %0 = affine.apply #map4(%arg3) + %1 = memref.load %alloca[%0] : memref<32xf32, #gpu.address_space> + %2 = arith.addf %1, %cst : f32 + %3 = affine.apply #map5(%arg3) + %4 = memref.load %alloca[%3] : memref<32xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_4[%arg3] : 
memref<16xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_5 = memref.alloca() : memref<8xf32, #gpu.address_space> + scf.forall (%arg3) in (8) { + %0 = affine.apply #map4(%arg3) + %1 = memref.load %alloca_4[%0] : memref<16xf32, #gpu.address_space> + %2 = arith.addf %1, %cst : f32 + %3 = affine.apply #map5(%arg3) + %4 = memref.load %alloca_4[%3] : memref<16xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_5[%arg3] : memref<8xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_6 = memref.alloca() : memref<4xf32, #gpu.address_space> + scf.forall (%arg3) in (4) { + %0 = affine.apply #map4(%arg3) + %1 = memref.load %alloca_5[%0] : memref<8xf32, #gpu.address_space> + %2 = arith.addf %1, %cst : f32 + %3 = affine.apply #map5(%arg3) + %4 = memref.load %alloca_5[%3] : memref<8xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_6[%arg3] : memref<4xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_7 = memref.alloca() : memref<2xf32, #gpu.address_space> + scf.forall (%arg3) in (2) { + %0 = affine.apply #map4(%arg3) + %1 = memref.load %alloca_6[%0] : memref<4xf32, #gpu.address_space> + %2 = arith.addf %1, %cst : f32 + %3 = affine.apply #map5(%arg3) + %4 = memref.load %alloca_6[%3] : memref<4xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_7[%arg3] : memref<2xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + scf.forall (%arg3) in (1) { + %0 = affine.apply #map4(%arg3) + %1 = memref.load %alloca_7[%0] : memref<2xf32, #gpu.address_space> + %2 = arith.addf %1, %cst : f32 + %3 = affine.apply #map5(%arg3) + %4 = memref.load %alloca_7[%3] : memref<2xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloc[] : memref + } {mapping = [#gpu.thread]} + } {mapping = [#gpu.block]} + return %alloc : memref + } + func.func private @Unknown148(%arg0: memref) -> memref attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 4.000000e+00 : f32 %alloc = memref.alloc() : memref - linalg.generic {indexing_maps = [#map5, #map5], iterator_types = []} ins(%arg0 : memref) outs(%alloc : memref) { + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%arg0 : memref) outs(%alloc : memref) { ^bb0(%in: f32, %out: f32): %0 = arith.negf %in : f32 %1 = arith.divf %0, %cst : f32 @@ -679,202 +1634,335 @@ module @IrToMhlo.2452 { } return %alloc : memref } - func.func private @Unknown142(%arg0: memref<64x3x7x7xf16>) -> memref<64x3x7x7xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown149(%arg0: memref<64x3x7x7xf16>) -> memref<64x3x7x7xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + %c7 = arith.constant 7 : index %alloc = memref.alloc() : memref<64x3x7x7xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x3x7x7xf16>) outs(%alloc : memref<64x3x7x7xf32>) attrs = {xla_shape = "f32[64,3,7,7]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c64 step %c1 { + scf.for %arg2 = %c0 to %c3 step %c1 { + scf.for %arg3 = %c0 to %c7 step %c1 { + scf.for %arg4 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x3x7x7xf16> to memref> + %subview_0 = 
memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x3x7x7xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<64x3x7x7xf32> } - func.func private @Unknown143(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<64x64x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf16>) outs(%alloc : memref<64x64x3x3xf32>) attrs = {xla_shape = "f32[64,64,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<64x64x3x3xf32> - } - func.func private @Unknown144(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<64x64x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf16>) outs(%alloc : memref<64x64x3x3xf32>) attrs = {xla_shape = "f32[64,64,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<64x64x3x3xf32> - } - func.func private @Unknown145(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<64x64x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf16>) outs(%alloc : memref<64x64x3x3xf32>) attrs = {xla_shape = "f32[64,64,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<64x64x3x3xf32> - } - func.func private @Unknown146(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown150(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<64x64x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf16>) outs(%alloc : memref<64x64x3x3xf32>) attrs = {xla_shape = "f32[64,64,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c64 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x64x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x64x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<64x64x3x3xf32> } - func.func private @Unknown147(%arg0: memref<128x64x3x3xf16>) 
-> memref<128x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown154(%arg0: memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<128x64x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x64x3x3xf16>) outs(%alloc : memref<128x64x3x3xf32>) attrs = {xla_shape = "f32[128,64,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<128x64x3x3xf32> } - func.func private @Unknown148(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown155(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<128x128x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x128x3x3xf16>) outs(%alloc : memref<128x128x3x3xf32>) attrs = {xla_shape = "f32[128,128,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c128 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x128x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x128x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<128x128x3x3xf32> } - func.func private @Unknown149(%arg0: memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown156(%arg0: memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index %alloc = memref.alloc() : memref<128x64x1x1xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x64x1x1xf16>) outs(%alloc : 
memref<128x64x1x1xf32>) attrs = {xla_shape = "f32[128,64,1,1]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x1x1xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x1x1xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } } return %alloc : memref<128x64x1x1xf32> } - func.func private @Unknown150(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<128x128x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x128x3x3xf16>) outs(%alloc : memref<128x128x3x3xf32>) attrs = {xla_shape = "f32[128,128,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<128x128x3x3xf32> - } - func.func private @Unknown151(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<128x128x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x128x3x3xf16>) outs(%alloc : memref<128x128x3x3xf32>) attrs = {xla_shape = "f32[128,128,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<128x128x3x3xf32> - } - func.func private @Unknown152(%arg0: memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown159(%arg0: memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<256x128x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x128x3x3xf16>) outs(%alloc : memref<256x128x3x3xf32>) attrs = {xla_shape = "f32[256,128,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c128 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<256x128x3x3xf32> } - func.func private @Unknown153(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private 
@Unknown160(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<256x256x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x256x3x3xf16>) outs(%alloc : memref<256x256x3x3xf32>) attrs = {xla_shape = "f32[256,256,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x256x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x256x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<256x256x3x3xf32> } - func.func private @Unknown154(%arg0: memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown161(%arg0: memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index %alloc = memref.alloc() : memref<256x128x1x1xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x128x1x1xf16>) outs(%alloc : memref<256x128x1x1xf32>) attrs = {xla_shape = "f32[256,128,1,1]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c128 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x1x1xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x1x1xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } } return %alloc : memref<256x128x1x1xf32> } - func.func private @Unknown155(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<256x256x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x256x3x3xf16>) outs(%alloc : memref<256x256x3x3xf32>) attrs = {xla_shape = "f32[256,256,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<256x256x3x3xf32> - } - func.func private @Unknown156(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<256x256x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", 
"parallel"]} ins(%arg0 : memref<256x256x3x3xf16>) outs(%alloc : memref<256x256x3x3xf32>) attrs = {xla_shape = "f32[256,256,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<256x256x3x3xf32> - } - func.func private @Unknown157(%arg0: memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown164(%arg0: memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<512x256x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x256x3x3xf16>) outs(%alloc : memref<512x256x3x3xf32>) attrs = {xla_shape = "f32[512,256,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<512x256x3x3xf32> } - func.func private @Unknown158(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown165(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<512x512x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x512x3x3xf16>) outs(%alloc : memref<512x512x3x3xf32>) attrs = {xla_shape = "f32[512,512,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c512 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x512x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x512x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<512x512x3x3xf32> } - func.func private @Unknown159(%arg0: memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown166(%arg0: memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = 
arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<512x256x1x1xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x256x1x1xf16>) outs(%alloc : memref<512x256x1x1xf32>) attrs = {xla_shape = "f32[512,256,1,1]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x1x1xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x1x1xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } } return %alloc : memref<512x256x1x1xf32> } - func.func private @Unknown160(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<512x512x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x512x3x3xf16>) outs(%alloc : memref<512x512x3x3xf32>) attrs = {xla_shape = "f32[512,512,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<512x512x3x3xf32> - } - func.func private @Unknown161(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<512x512x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x512x3x3xf16>) outs(%alloc : memref<512x512x3x3xf32>) attrs = {xla_shape = "f32[512,512,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<512x512x3x3xf32> - } - func.func private @Unknown163(%arg0: memref<1000x512xf16>) -> memref<1000x512xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown170(%arg0: memref<1000x512xf16>) -> memref<1000x512xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index %alloc = memref.alloc() : memref<1000x512xf32> - linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg0 : memref<1000x512xf16>) outs(%alloc : memref<1000x512xf32>) attrs = {xla_shape = "f32[1000,512]{0,1}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c1000 step %c1 { + scf.for %arg2 = %c0 to %c512 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2] [1, 1] [1, 1] : memref<1000x512xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2] [1, 1] [1, 1] : memref<1000x512xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } } return %alloc : memref<1000x512xf32> } - func.func private 
@Unknown164(%arg0: memref<1000xf32>) -> memref<1000xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown171(%arg0: memref<4x1000xf16>) -> memref<1000xf32> attributes {__byteir_reduction_fusion__} { + %cst = arith.constant 0.000000e+00 : f32 + %cst_0 = arith.constant 0.000000e+00 : f16 + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %alloc = memref.alloc() : memref<1000xf32> + scf.forall (%arg1) in (32) { + %0 = affine.min #map11(%arg1) + %1 = affine.apply #map12(%arg1) + %alloca = memref.alloca() : memref<32xf32, #gpu.address_space> + %alloca_1 = memref.alloca() : memref<2x32xf32, #gpu.address_space> + scf.forall (%arg2, %arg3) in (2, 32) { + %2 = affine.min #map13(%arg3, %arg1) + %3 = affine.min #map14(%arg3, %arg1) + %4 = affine.apply #map3(%3, %2) + %5 = arith.cmpi ugt, %4, %c0 : index + %6 = scf.if %5 -> (f16) { + %12 = affine.apply #map4(%arg2) + %13 = affine.apply #map10(%arg1)[%2] + %14 = memref.load %arg0[%12, %13] : memref<4x1000xf16> + scf.yield %14 : f16 + } else { + scf.yield %cst_0 : f16 + } + %7 = arith.extf %6 : f16 to f32 + %8 = arith.addf %7, %cst : f32 + %9 = scf.if %5 -> (f16) { + %12 = affine.apply #map5(%arg2) + %13 = affine.apply #map10(%arg1)[%2] + %14 = memref.load %arg0[%12, %13] : memref<4x1000xf16> + scf.yield %14 : f16 + } else { + scf.yield %cst_0 : f16 + } + %10 = arith.extf %9 : f16 to f32 + %11 = arith.addf %8, %10 : f32 + memref.store %11, %alloca_1[%arg2, %arg3] : memref<2x32xf32, #gpu.address_space> + } {mapping = [#gpu.thread, #gpu.thread]} + scf.forall (%arg2) in (32) { + %2 = memref.load %alloca_1[%c0, %arg2] : memref<2x32xf32, #gpu.address_space> + %3 = arith.addf %2, %cst : f32 + %4 = memref.load %alloca_1[%c1, %arg2] : memref<2x32xf32, #gpu.address_space> + %5 = arith.addf %4, %3 : f32 + memref.store %5, %alloca[%arg2] : memref<32xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %subview = memref.subview %alloca[0] [%0] [1] : memref<32xf32, #gpu.address_space> to memref, #gpu.address_space> + %subview_2 = memref.subview %alloc[%1] [%0] [1] : memref<1000xf32> to memref> + memref.copy %subview, %subview_2 : memref, #gpu.address_space> to memref> + } {mapping = [#gpu.block]} + return %alloc : memref<1000xf32> + } + func.func private @Unknown172(%arg0: memref<1000xf32>) -> memref<1000xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index %alloc = memref.alloc() : memref<1000xf32> - linalg.generic {indexing_maps = [#map6, #map6], iterator_types = ["parallel"]} ins(%arg0 : memref<1000xf32>) outs(%alloc : memref<1000xf32>) { - ^bb0(%in: f32, %out: f32): - %0 = arith.truncf %in : f32 to f16 - %1 = arith.extf %0 : f16 to f32 - linalg.yield %1 : f32 + scf.for %arg1 = %c0 to %c1000 step %c1 { + %subview = memref.subview %arg0[%arg1] [1] [1] : memref<1000xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1] [1] [1] : memref<1000xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f32): + %0 = arith.truncf %in : f32 to f16 + %1 = arith.extf %0 : f16 to f32 + linalg.yield %1 : f32 + } } return %alloc : memref<1000xf32> } @@ -886,344 +1974,340 @@ module @IrToMhlo.2452 { %alloc_0 = memref.alloc() : memref<4x64x112x112xf16> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc, %arg3, %arg4, %alloc_0) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : 
i32, 2 : i32]} : memref<4x64x112x112xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x112x112xf16> %2 = call @Unknown3(%arg7) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> - %3 = call @Unknown4(%arg12) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> - %4 = call @Unknown5(%arg17) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> - %5 = call @Unknown6(%arg22) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> + %3 = call @Unknown3(%arg12) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> + %4 = call @Unknown3(%arg17) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> + %5 = call @Unknown3(%arg22) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> %6 = call @Unknown7(%arg37) : (memref<128x64x1x1xf32>) -> memref<128x64x1x1xf16> %7 = call @Unknown8(%arg27) : (memref<128x64x3x3xf32>) -> memref<128x64x3x3xf16> %8 = call @Unknown9(%arg32) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> - %9 = call @Unknown10(%arg42) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> - %10 = call @Unknown11(%arg47) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> + %9 = call @Unknown9(%arg42) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> + %10 = call @Unknown9(%arg47) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> %11 = call @Unknown12(%arg62) : (memref<256x128x1x1xf32>) -> memref<256x128x1x1xf16> %12 = call @Unknown13(%arg52) : (memref<256x128x3x3xf32>) -> memref<256x128x3x3xf16> %13 = call @Unknown14(%arg57) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> - %14 = call @Unknown15(%arg67) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> - %15 = call @Unknown16(%arg72) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> + %14 = call @Unknown14(%arg67) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> + %15 = call @Unknown14(%arg72) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> %16 = call @Unknown17(%arg87) : (memref<512x256x1x1xf32>) -> memref<512x256x1x1xf16> %17 = call @Unknown18(%arg77) : (memref<512x256x3x3xf32>) -> memref<512x256x3x3xf16> %18 = call @Unknown19(%arg82) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> - %19 = call @Unknown20(%arg92) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> - %20 = call @Unknown21(%arg97) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> + %19 = call @Unknown19(%arg92) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> + %20 = call @Unknown19(%arg97) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> %21 = call @Unknown22(%arg1) : (memref<4x1000xf32>) -> memref<4x1000xf16> %22 = call @Unknown23(%arg102) : (memref<1000x512xf32>) -> memref<1000x512xf16> - %alloc_1 = memref.alloc() : memref<4xf16> - byre.compute @ReduceSumOp_f16_f16(%21, %alloc_1) {dimensions = dense<1> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf16>, memref<4xf16> - %23:2 = call @Unknown24(%alloc_0) : (memref<4x64x112x112xf16>) -> (memref<4x64x112x112xf16>, memref<4x64x112x112xi1>) + %23 = call @Unknown24(%arg103) : (memref<1000xf32>) -> memref<1000xf16> + %24 = call @Unknown25(%21) : (memref<4x1000xf16>) -> memref<4xf16> + %25:2 = call @Unknown26(%alloc_0) : (memref<4x64x112x112xf16>) -> (memref<4x64x112x112xf16>, memref<4x64x112x112xi1>) + %alloc_1 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @PoolMaxOp_f16_f16(%25#0, %alloc_1) {base_dilations = dense<1> : tensor<4xi64>, memory_effects = [1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = dense<1> : tensor<4xi64>, window_dimensions = dense<[1, 1, 3, 3]> : 
tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16>, memref<4x64x56x56xf16> %alloc_2 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @PoolMaxOp_f16_f16(%23#0, %alloc_2) {base_dilations = dense<1> : tensor<4xi64>, memory_effects = [1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = dense<1> : tensor<4xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16>, memref<4x64x56x56xf16> + byre.compute @ConvOp_f16f16_f16(%alloc_1, %2, %alloc_2) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> %alloc_3 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvOp_f16f16_f16(%alloc_2, %2, %alloc_3) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_2, %arg8, %arg9, %alloc_3) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> + %26:2 = call @Unknown28(%alloc_3) : (memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) %alloc_4 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_3, %arg8, %arg9, %alloc_4) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> - %24:2 = call @Unknown26(%alloc_4) : (memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) + byre.compute @ConvOp_f16f16_f16(%26#0, %3, %alloc_4) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> %alloc_5 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvOp_f16f16_f16(%24#0, %3, %alloc_5) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_4, %arg13, %arg14, %alloc_5) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 
: i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> + %27:2 = call @Unknown30(%alloc_5, %alloc_1) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) %alloc_6 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_5, %arg13, %arg14, %alloc_6) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> - %25:2 = call @Unknown28(%alloc_6, %alloc_2) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) + byre.compute @ConvOp_f16f16_f16(%27#0, %4, %alloc_6) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> %alloc_7 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvOp_f16f16_f16(%25#0, %4, %alloc_7) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_6, %arg18, %arg19, %alloc_7) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> + %28:2 = call @Unknown28(%alloc_7) : (memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) %alloc_8 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_7, %arg18, %arg19, %alloc_8) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> - %26:2 = call @Unknown30(%alloc_8) : (memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) + byre.compute @ConvOp_f16f16_f16(%28#0, %5, %alloc_8) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> %alloc_9 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvOp_f16f16_f16(%26#0, %5, %alloc_9) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> - 
%alloc_10 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_9, %arg23, %arg24, %alloc_10) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> - %27:2 = call @Unknown32(%alloc_10, %25#0) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_8, %arg23, %arg24, %alloc_9) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> + %29:2 = call @Unknown30(%alloc_9, %27#0) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) + %alloc_10 = memref.alloc() : memref<4x128x28x28xf16> + byre.compute @ConvOp_f16f16_f16(%29#0, %6, %alloc_10) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<128x64x1x1xf16>, memref<4x128x28x28xf16> %alloc_11 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvOp_f16f16_f16(%27#0, %6, %alloc_11) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<128x64x1x1xf16>, memref<4x128x28x28xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_10, %arg38, %arg39, %alloc_11) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> %alloc_12 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_11, %arg38, %arg39, %alloc_12) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> + byre.compute @ConvOp_f16f16_f16(%29#0, %7, %alloc_12) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<128x64x3x3xf16>, memref<4x128x28x28xf16> %alloc_13 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvOp_f16f16_f16(%27#0, %7, %alloc_13) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<128x64x3x3xf16>, 
memref<4x128x28x28xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_12, %arg28, %arg29, %alloc_13) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> + %30:2 = call @Unknown37(%alloc_13) : (memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) %alloc_14 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_13, %arg28, %arg29, %alloc_14) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> - %28:2 = call @Unknown35(%alloc_14) : (memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) + byre.compute @ConvOp_f16f16_f16(%30#0, %8, %alloc_14) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> %alloc_15 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvOp_f16f16_f16(%28#0, %8, %alloc_15) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_14, %arg33, %arg34, %alloc_15) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> + %31:2 = call @Unknown39(%alloc_15, %alloc_11) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) %alloc_16 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_15, %arg33, %arg34, %alloc_16) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> - %29:2 = call @Unknown37(%alloc_16, %alloc_12) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) + byre.compute @ConvOp_f16f16_f16(%31#0, %9, %alloc_16) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> %alloc_17 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvOp_f16f16_f16(%29#0, %9, %alloc_17) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, 
memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_16, %arg43, %arg44, %alloc_17) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> + %32:2 = call @Unknown37(%alloc_17) : (memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) %alloc_18 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_17, %arg43, %arg44, %alloc_18) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> - %30:2 = call @Unknown39(%alloc_18) : (memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) + byre.compute @ConvOp_f16f16_f16(%32#0, %10, %alloc_18) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> %alloc_19 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvOp_f16f16_f16(%30#0, %10, %alloc_19) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> - %alloc_20 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_19, %arg48, %arg49, %alloc_20) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> - %31:2 = call @Unknown41(%alloc_20, %29#0) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_18, %arg48, %arg49, %alloc_19) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> + %33:2 = call @Unknown39(%alloc_19, %31#0) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) + %alloc_20 = memref.alloc() : memref<4x256x14x14xf16> + byre.compute @ConvOp_f16f16_f16(%33#0, %11, %alloc_20) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, 
memref<256x128x1x1xf16>, memref<4x256x14x14xf16> %alloc_21 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvOp_f16f16_f16(%31#0, %11, %alloc_21) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<256x128x1x1xf16>, memref<4x256x14x14xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_20, %arg63, %arg64, %alloc_21) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> %alloc_22 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_21, %arg63, %arg64, %alloc_22) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> + byre.compute @ConvOp_f16f16_f16(%33#0, %12, %alloc_22) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<256x128x3x3xf16>, memref<4x256x14x14xf16> %alloc_23 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvOp_f16f16_f16(%31#0, %12, %alloc_23) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<256x128x3x3xf16>, memref<4x256x14x14xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_22, %arg53, %arg54, %alloc_23) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> + %34:2 = call @Unknown46(%alloc_23) : (memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) %alloc_24 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_23, %arg53, %arg54, %alloc_24) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> - %32:2 = call @Unknown44(%alloc_24) : (memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) + byre.compute @ConvOp_f16f16_f16(%34#0, %13, %alloc_24) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> %alloc_25 = 
memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvOp_f16f16_f16(%32#0, %13, %alloc_25) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_24, %arg58, %arg59, %alloc_25) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> + %35:2 = call @Unknown48(%alloc_25, %alloc_21) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) %alloc_26 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_25, %arg58, %arg59, %alloc_26) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> - %33:2 = call @Unknown46(%alloc_26, %alloc_22) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) + byre.compute @ConvOp_f16f16_f16(%35#0, %14, %alloc_26) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> %alloc_27 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvOp_f16f16_f16(%33#0, %14, %alloc_27) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_26, %arg68, %arg69, %alloc_27) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> + %36:2 = call @Unknown46(%alloc_27) : (memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) %alloc_28 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_27, %arg68, %arg69, %alloc_28) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> - %34:2 = call @Unknown48(%alloc_28) : (memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) + byre.compute @ConvOp_f16f16_f16(%36#0, %15, %alloc_28) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : 
i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> %alloc_29 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvOp_f16f16_f16(%34#0, %15, %alloc_29) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> - %alloc_30 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_29, %arg73, %arg74, %alloc_30) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> - %35:2 = call @Unknown50(%alloc_30, %33#0) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_28, %arg73, %arg74, %alloc_29) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> + %37:2 = call @Unknown48(%alloc_29, %35#0) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) + %alloc_30 = memref.alloc() : memref<4x512x7x7xf16> + byre.compute @ConvOp_f16f16_f16(%37#0, %16, %alloc_30) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<512x256x1x1xf16>, memref<4x512x7x7xf16> %alloc_31 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvOp_f16f16_f16(%35#0, %16, %alloc_31) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<512x256x1x1xf16>, memref<4x512x7x7xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_30, %arg88, %arg89, %alloc_31) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> %alloc_32 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_31, %arg88, %arg89, %alloc_32) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> + byre.compute @ConvOp_f16f16_f16(%37#0, %17, %alloc_32) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, 
memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<512x256x3x3xf16>, memref<4x512x7x7xf16> %alloc_33 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvOp_f16f16_f16(%35#0, %17, %alloc_33) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<512x256x3x3xf16>, memref<4x512x7x7xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_32, %arg78, %arg79, %alloc_33) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> + %38:2 = call @Unknown55(%alloc_33) : (memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) %alloc_34 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_33, %arg78, %arg79, %alloc_34) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> - %36:2 = call @Unknown53(%alloc_34) : (memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) + byre.compute @ConvOp_f16f16_f16(%38#0, %18, %alloc_34) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> %alloc_35 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvOp_f16f16_f16(%36#0, %18, %alloc_35) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_34, %arg83, %arg84, %alloc_35) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> + %39:2 = call @Unknown57(%alloc_35, %alloc_31) : (memref<4x512x7x7xf16>, memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) %alloc_36 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_35, %arg83, %arg84, %alloc_36) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> - %37:2 = call @Unknown55(%alloc_36, %alloc_32) : (memref<4x512x7x7xf16>, memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) + byre.compute 
@ConvOp_f16f16_f16(%39#0, %19, %alloc_36) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> %alloc_37 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvOp_f16f16_f16(%37#0, %19, %alloc_37) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_36, %arg93, %arg94, %alloc_37) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> + %40:2 = call @Unknown55(%alloc_37) : (memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) %alloc_38 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_37, %arg93, %arg94, %alloc_38) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> - %38:2 = call @Unknown57(%alloc_38) : (memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) + byre.compute @ConvOp_f16f16_f16(%40#0, %20, %alloc_38) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> %alloc_39 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvOp_f16f16_f16(%38#0, %20, %alloc_39) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> - %alloc_40 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_39, %arg98, %arg99, %alloc_40) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> - %39:2 = call @Unknown59(%alloc_40, %37#0) : (memref<4x512x7x7xf16>, memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_38, %arg98, %arg99, %alloc_39) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, 
memref<4x512x7x7xf16>
+    %41:2 = call @Unknown57(%alloc_39, %39#0) : (memref<4x512x7x7xf16>, memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>)
+    %42 = call @Unknown62(%41#0) : (memref<4x512x7x7xf16>) -> memref<4x512xf16>
+    %43 = call @Unknown63(%42) : (memref<4x512xf16>) -> memref<4x512xf16>
+    %alloc_40 = memref.alloc() : memref<4x1000xf16>
+    byre.compute @MatmulOp_f16f16_f16(%43, %22, %alloc_40) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<4x512xf16>, memref<1000x512xf16>, memref<4x1000xf16>
+    %44 = call @Unknown64(%23, %alloc_40) : (memref<1000xf16>, memref<4x1000xf16>) -> memref<4x1000xf16>
+    %45 = call @Unknown65(%44) : (memref<4x1000xf16>) -> memref<4xf16>
+    %46 = call @Unknown66(%45, %44) : (memref<4xf16>, memref<4x1000xf16>) -> memref<4x1000xf16>
+    %47 = call @Unknown67(%46) : (memref<4x1000xf16>) -> memref<4xf16>
+    %48 = call @Unknown68(%47) : (memref<4xf16>) -> memref<4xf16>
+    %49:2 = call @Unknown69(%48, %46, %24, %21) : (memref<4xf16>, memref<4x1000xf16>, memref<4xf16>, memref<4x1000xf16>) -> (memref<4x1000xf16>, memref<4x1000xf16>)
     %alloc_41 = memref.alloc() : memref<4x512xf16>
-    byre.compute @ReduceSumOp_f16_f16(%39#0, %alloc_41) {dimensions = dense<[3, 2]> : tensor<2xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<4x512xf16>
-    %40 = call @Unknown60(%alloc_41) : (memref<4x512xf16>) -> memref<4x512xf16>
-    %alloc_42 = memref.alloc() : memref<4x1000xf16>
-    byre.compute @MatmulOp_f16f16_f16(%40, %22, %alloc_42) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<4x512xf16>, memref<1000x512xf16>, memref<4x1000xf16>
-    %41 = call @Unknown61(%arg103, %alloc_42) : (memref<1000xf32>, memref<4x1000xf16>) -> memref<4x1000xf16>
-    %alloc_43 = memref.alloc() : memref<4xf16>
-    byre.compute @ReduceMaxOp_f16_f16(%41, %alloc_43) {dimensions = dense<1> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf16>, memref<4xf16>
-    %42:2 = call @Unknown62(%alloc_43, %41) : (memref<4xf16>, memref<4x1000xf16>) -> (memref<4x1000xf16>, memref<4x1000xf16>)
-    %alloc_44 = memref.alloc() : memref<4xf16>
-    byre.compute @ReduceSumOp_f16_f16(%42#1, %alloc_44) {dimensions = dense<1> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf16>, memref<4xf16>
-    %43:3 = call @Unknown63(%alloc_44, %42#0, %alloc_1, %21, %arg1) : (memref<4xf16>, memref<4x1000xf16>, memref<4xf16>, memref<4x1000xf16>, memref<4x1000xf32>) -> (memref<4x1000xf16>, memref<4x1000xf32>, memref<4x1000xf32>)
-    %alloc_45 = memref.alloc() : memref<4x512xf16>
-    byre.compute @MatmulOp_f16f16_f16(%43#0, %22, %alloc_45) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<4x1000xf16>, memref<1000x512xf16>, memref<4x512xf16>
-    %44 = call @Unknown64(%alloc_45, %39#1) : (memref<4x512xf16>, memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16>
-    %alloc_46 = memref.alloc() : memref<4x512x7x7xf16>
-    %alloc_47 = memref.alloc() : memref<512xf32>
+    byre.compute @MatmulOp_f16f16_f16(%49#1, %22, %alloc_41) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<4x1000xf16>, memref<1000x512xf16>, memref<4x512xf16>
+    %50 = call @Unknown70(%alloc_41, %41#1) : (memref<4x512xf16>, memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16>
+    %alloc_42 = memref.alloc() : memref<4x512x7x7xf16>
+    %alloc_43 = memref.alloc() : memref<512xf32>
+    %alloc_44 = memref.alloc() : memref<512xf32>
+    byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_38, %arg98, %50, %alloc_42, %alloc_43, %alloc_44) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>
+    %alloc_45 = memref.alloc() : memref<4x512x7x7xf16>
+    byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_42, %20, %alloc_45) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16>
+    %alloc_46 = memref.alloc() : memref<512x512x3x3xf16>
+    byre.compute @ConvBackwardFilterOp_f16f16_f16(%40#0, %alloc_42, %alloc_46) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512x512x3x3xf16>
+    %51 = call @Unknown74(%40#1, %alloc_45) : (memref<4x512x7x7xi1>, memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16>
+    %alloc_47 = memref.alloc() : memref<4x512x7x7xf16>
     %alloc_48 = memref.alloc() : memref<512xf32>
-    byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_39, %arg98, %44, %alloc_46, %alloc_47, %alloc_48) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>
-    %alloc_49 = memref.alloc() : memref<4x512x7x7xf16>
-    byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_46, %20, %alloc_49) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16>
-    %alloc_50 = memref.alloc() : memref<512x512x3x3xf16>
-    byre.compute @ConvBackwardFilterOp_f16f16_f16(%38#0, %alloc_46, %alloc_50) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512x512x3x3xf16>
-    %45 = call @Unknown68(%38#1, %alloc_49) : (memref<4x512x7x7xi1>, memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16>
-    %alloc_51 = memref.alloc() : memref<4x512x7x7xf16>
-    %alloc_52 = memref.alloc() : memref<512xf32>
+    %alloc_49 = memref.alloc() : memref<512xf32>
+    byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_36, %arg93, %51, %alloc_47, %alloc_48, %alloc_49) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>
+    %alloc_50 = memref.alloc() : memref<4x512x7x7xf16>
+    byre.compute
@ConvBackwardDataOp_f16f16_f16(%alloc_47, %19, %alloc_50) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> + %alloc_51 = memref.alloc() : memref<512x512x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%39#0, %alloc_47, %alloc_51) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512x512x3x3xf16> + %52 = call @Unknown78(%50, %alloc_50, %39#1) : (memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> + %alloc_52 = memref.alloc() : memref<4x512x7x7xf16> %alloc_53 = memref.alloc() : memref<512xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_37, %arg93, %45, %alloc_51, %alloc_52, %alloc_53) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %alloc_54 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_51, %19, %alloc_54) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> - %alloc_55 = memref.alloc() : memref<512x512x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%37#0, %alloc_51, %alloc_55) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512x512x3x3xf16> - %46 = call @Unknown72(%44, %alloc_54, %37#1) : (memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> - %alloc_56 = memref.alloc() : memref<4x512x7x7xf16> - %alloc_57 = memref.alloc() : memref<512xf32> + %alloc_54 = memref.alloc() : memref<512xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_34, %arg83, %52, %alloc_52, %alloc_53, %alloc_54) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> + %alloc_55 = memref.alloc() : memref<4x512x7x7xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_52, %18, %alloc_55) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> + %alloc_56 = memref.alloc() : memref<512x512x3x3xf16> + byre.compute 
@ConvBackwardFilterOp_f16f16_f16(%38#0, %alloc_52, %alloc_56) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512x512x3x3xf16> + %53 = call @Unknown74(%38#1, %alloc_55) : (memref<4x512x7x7xi1>, memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> + %alloc_57 = memref.alloc() : memref<4x512x7x7xf16> %alloc_58 = memref.alloc() : memref<512xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_35, %arg83, %46, %alloc_56, %alloc_57, %alloc_58) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %alloc_59 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_56, %18, %alloc_59) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> - %alloc_60 = memref.alloc() : memref<512x512x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%36#0, %alloc_56, %alloc_60) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512x512x3x3xf16> - %47 = call @Unknown76(%36#1, %alloc_59) : (memref<4x512x7x7xi1>, memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> - %alloc_61 = memref.alloc() : memref<4x512x7x7xf16> - %alloc_62 = memref.alloc() : memref<512xf32> + %alloc_59 = memref.alloc() : memref<512xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_32, %arg78, %53, %alloc_57, %alloc_58, %alloc_59) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> + %alloc_60 = memref.alloc() : memref<4x256x14x14xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_57, %17, %alloc_60) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x256x3x3xf16>, memref<4x256x14x14xf16> + %alloc_61 = memref.alloc() : memref<512x256x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%37#0, %alloc_57, %alloc_61) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x512x7x7xf16>, memref<512x256x3x3xf16> + %alloc_62 = memref.alloc() : memref<4x512x7x7xf16> %alloc_63 = memref.alloc() : memref<512xf32> - byre.compute 
@BatchNormGradOp_f16f32f16_f16f32f32(%alloc_33, %arg78, %47, %alloc_61, %alloc_62, %alloc_63) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %alloc_64 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_61, %17, %alloc_64) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x256x3x3xf16>, memref<4x256x14x14xf16> - %alloc_65 = memref.alloc() : memref<512x256x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%35#0, %alloc_61, %alloc_65) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x512x7x7xf16>, memref<512x256x3x3xf16> - %alloc_66 = memref.alloc() : memref<4x512x7x7xf16> - %alloc_67 = memref.alloc() : memref<512xf32> - %alloc_68 = memref.alloc() : memref<512xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_31, %arg88, %46, %alloc_66, %alloc_67, %alloc_68) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %alloc_69 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_66, %16, %alloc_69) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x256x1x1xf16>, memref<4x256x14x14xf16> - %alloc_70 = memref.alloc() : memref<512x256x1x1xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%35#0, %alloc_66, %alloc_70) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x512x7x7xf16>, memref<512x256x1x1xf16> - %48 = call @Unknown83(%alloc_69, %alloc_64, %35#1) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> - %alloc_71 = memref.alloc() : memref<4x256x14x14xf16> - %alloc_72 = memref.alloc() : memref<256xf32> + %alloc_64 = memref.alloc() : memref<512xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_30, %arg88, %52, %alloc_62, %alloc_63, %alloc_64) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> + %alloc_65 = memref.alloc() : memref<4x256x14x14xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_62, %16, %alloc_65) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, 
input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x256x1x1xf16>, memref<4x256x14x14xf16> + %alloc_66 = memref.alloc() : memref<512x256x1x1xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%37#0, %alloc_62, %alloc_66) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x512x7x7xf16>, memref<512x256x1x1xf16> + %54 = call @Unknown89(%alloc_65, %alloc_60, %37#1) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> + %alloc_67 = memref.alloc() : memref<4x256x14x14xf16> + %alloc_68 = memref.alloc() : memref<256xf32> + %alloc_69 = memref.alloc() : memref<256xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_28, %arg73, %54, %alloc_67, %alloc_68, %alloc_69) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> + %alloc_70 = memref.alloc() : memref<4x256x14x14xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_67, %15, %alloc_70) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> + %alloc_71 = memref.alloc() : memref<256x256x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%36#0, %alloc_67, %alloc_71) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256x256x3x3xf16> + %55 = call @Unknown93(%36#1, %alloc_70) : (memref<4x256x14x14xi1>, memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> + %alloc_72 = memref.alloc() : memref<4x256x14x14xf16> %alloc_73 = memref.alloc() : memref<256xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_29, %arg73, %48, %alloc_71, %alloc_72, %alloc_73) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %alloc_74 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_71, %15, %alloc_74) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> - %alloc_75 = memref.alloc() : memref<256x256x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%34#0, %alloc_71, %alloc_75) {batch_group_count = 1 : i64, 
feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256x256x3x3xf16> - %49 = call @Unknown87(%34#1, %alloc_74) : (memref<4x256x14x14xi1>, memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> - %alloc_76 = memref.alloc() : memref<4x256x14x14xf16> - %alloc_77 = memref.alloc() : memref<256xf32> + %alloc_74 = memref.alloc() : memref<256xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_26, %arg68, %55, %alloc_72, %alloc_73, %alloc_74) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> + %alloc_75 = memref.alloc() : memref<4x256x14x14xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_72, %14, %alloc_75) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> + %alloc_76 = memref.alloc() : memref<256x256x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%35#0, %alloc_72, %alloc_76) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256x256x3x3xf16> + %56 = call @Unknown89(%54, %alloc_75, %35#1) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> + %alloc_77 = memref.alloc() : memref<4x256x14x14xf16> %alloc_78 = memref.alloc() : memref<256xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_27, %arg68, %49, %alloc_76, %alloc_77, %alloc_78) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %alloc_79 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_76, %14, %alloc_79) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> - %alloc_80 = memref.alloc() : memref<256x256x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%33#0, %alloc_76, %alloc_80) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256x256x3x3xf16> - %50 = call @Unknown91(%48, %alloc_79, %33#1) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) -> 
memref<4x256x14x14xf16> - %alloc_81 = memref.alloc() : memref<4x256x14x14xf16> - %alloc_82 = memref.alloc() : memref<256xf32> + %alloc_79 = memref.alloc() : memref<256xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_24, %arg58, %56, %alloc_77, %alloc_78, %alloc_79) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> + %alloc_80 = memref.alloc() : memref<4x256x14x14xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_77, %13, %alloc_80) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> + %alloc_81 = memref.alloc() : memref<256x256x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%34#0, %alloc_77, %alloc_81) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256x256x3x3xf16> + %57 = call @Unknown93(%34#1, %alloc_80) : (memref<4x256x14x14xi1>, memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> + %alloc_82 = memref.alloc() : memref<4x256x14x14xf16> %alloc_83 = memref.alloc() : memref<256xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_25, %arg58, %50, %alloc_81, %alloc_82, %alloc_83) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %alloc_84 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_81, %13, %alloc_84) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> - %alloc_85 = memref.alloc() : memref<256x256x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%32#0, %alloc_81, %alloc_85) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256x256x3x3xf16> - %51 = call @Unknown95(%32#1, %alloc_84) : (memref<4x256x14x14xi1>, memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> - %alloc_86 = memref.alloc() : memref<4x256x14x14xf16> - %alloc_87 = memref.alloc() : memref<256xf32> + %alloc_84 = memref.alloc() : memref<256xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_22, %arg53, %57, %alloc_82, %alloc_83, %alloc_84) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, 
memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> + %alloc_85 = memref.alloc() : memref<4x128x28x28xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_82, %12, %alloc_85) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x128x3x3xf16>, memref<4x128x28x28xf16> + %alloc_86 = memref.alloc() : memref<256x128x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%33#0, %alloc_82, %alloc_86) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x256x14x14xf16>, memref<256x128x3x3xf16> + %alloc_87 = memref.alloc() : memref<4x256x14x14xf16> %alloc_88 = memref.alloc() : memref<256xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_23, %arg53, %51, %alloc_86, %alloc_87, %alloc_88) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %alloc_89 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_86, %12, %alloc_89) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x128x3x3xf16>, memref<4x128x28x28xf16> - %alloc_90 = memref.alloc() : memref<256x128x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%31#0, %alloc_86, %alloc_90) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x256x14x14xf16>, memref<256x128x3x3xf16> - %alloc_91 = memref.alloc() : memref<4x256x14x14xf16> - %alloc_92 = memref.alloc() : memref<256xf32> - %alloc_93 = memref.alloc() : memref<256xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_21, %arg63, %50, %alloc_91, %alloc_92, %alloc_93) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %alloc_94 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_91, %11, %alloc_94) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x128x1x1xf16>, memref<4x128x28x28xf16> - %alloc_95 = memref.alloc() : memref<256x128x1x1xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%31#0, %alloc_91, %alloc_95) {batch_group_count = 1 : i64, 
feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x256x14x14xf16>, memref<256x128x1x1xf16> - %52 = call @Unknown102(%alloc_94, %alloc_89, %31#1) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> - %alloc_96 = memref.alloc() : memref<4x128x28x28xf16> - %alloc_97 = memref.alloc() : memref<128xf32> + %alloc_89 = memref.alloc() : memref<256xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_20, %arg63, %56, %alloc_87, %alloc_88, %alloc_89) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> + %alloc_90 = memref.alloc() : memref<4x128x28x28xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_87, %11, %alloc_90) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x128x1x1xf16>, memref<4x128x28x28xf16> + %alloc_91 = memref.alloc() : memref<256x128x1x1xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%33#0, %alloc_87, %alloc_91) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x256x14x14xf16>, memref<256x128x1x1xf16> + %58 = call @Unknown108(%alloc_90, %alloc_85, %33#1) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> + %alloc_92 = memref.alloc() : memref<4x128x28x28xf16> + %alloc_93 = memref.alloc() : memref<128xf32> + %alloc_94 = memref.alloc() : memref<128xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_18, %arg48, %58, %alloc_92, %alloc_93, %alloc_94) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> + %alloc_95 = memref.alloc() : memref<4x128x28x28xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_92, %10, %alloc_95) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> + %alloc_96 = memref.alloc() : memref<128x128x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%32#0, %alloc_92, %alloc_96) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128x128x3x3xf16> + %59 = call @Unknown112(%32#1, %alloc_95) : 
(memref<4x128x28x28xi1>, memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> + %alloc_97 = memref.alloc() : memref<4x128x28x28xf16> %alloc_98 = memref.alloc() : memref<128xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_19, %arg48, %52, %alloc_96, %alloc_97, %alloc_98) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %alloc_99 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_96, %10, %alloc_99) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> - %alloc_100 = memref.alloc() : memref<128x128x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%30#0, %alloc_96, %alloc_100) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128x128x3x3xf16> - %53 = call @Unknown106(%30#1, %alloc_99) : (memref<4x128x28x28xi1>, memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> - %alloc_101 = memref.alloc() : memref<4x128x28x28xf16> - %alloc_102 = memref.alloc() : memref<128xf32> + %alloc_99 = memref.alloc() : memref<128xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_16, %arg43, %59, %alloc_97, %alloc_98, %alloc_99) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> + %alloc_100 = memref.alloc() : memref<4x128x28x28xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_97, %9, %alloc_100) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> + %alloc_101 = memref.alloc() : memref<128x128x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%31#0, %alloc_97, %alloc_101) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128x128x3x3xf16> + %60 = call @Unknown108(%58, %alloc_100, %31#1) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> + %alloc_102 = memref.alloc() : memref<4x128x28x28xf16> %alloc_103 = memref.alloc() : memref<128xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_17, %arg43, %53, %alloc_101, %alloc_102, %alloc_103) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : 
memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %alloc_104 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_101, %9, %alloc_104) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> - %alloc_105 = memref.alloc() : memref<128x128x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%29#0, %alloc_101, %alloc_105) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128x128x3x3xf16> - %54 = call @Unknown110(%52, %alloc_104, %29#1) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> - %alloc_106 = memref.alloc() : memref<4x128x28x28xf16> - %alloc_107 = memref.alloc() : memref<128xf32> + %alloc_104 = memref.alloc() : memref<128xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_14, %arg33, %60, %alloc_102, %alloc_103, %alloc_104) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> + %alloc_105 = memref.alloc() : memref<4x128x28x28xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_102, %8, %alloc_105) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> + %alloc_106 = memref.alloc() : memref<128x128x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%30#0, %alloc_102, %alloc_106) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128x128x3x3xf16> + %61 = call @Unknown112(%30#1, %alloc_105) : (memref<4x128x28x28xi1>, memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> + %alloc_107 = memref.alloc() : memref<4x128x28x28xf16> %alloc_108 = memref.alloc() : memref<128xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_15, %arg33, %54, %alloc_106, %alloc_107, %alloc_108) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %alloc_109 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_106, %8, %alloc_109) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = 
"NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> - %alloc_110 = memref.alloc() : memref<128x128x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%28#0, %alloc_106, %alloc_110) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128x128x3x3xf16> - %55 = call @Unknown114(%28#1, %alloc_109) : (memref<4x128x28x28xi1>, memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> - %alloc_111 = memref.alloc() : memref<4x128x28x28xf16> - %alloc_112 = memref.alloc() : memref<128xf32> + %alloc_109 = memref.alloc() : memref<128xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_12, %arg28, %61, %alloc_107, %alloc_108, %alloc_109) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> + %alloc_110 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_107, %7, %alloc_110) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x64x3x3xf16>, memref<4x64x56x56xf16> + %alloc_111 = memref.alloc() : memref<128x64x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%29#0, %alloc_107, %alloc_111) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x128x28x28xf16>, memref<128x64x3x3xf16> + %alloc_112 = memref.alloc() : memref<4x128x28x28xf16> %alloc_113 = memref.alloc() : memref<128xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_13, %arg28, %55, %alloc_111, %alloc_112, %alloc_113) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %alloc_114 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_111, %7, %alloc_114) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x64x3x3xf16>, memref<4x64x56x56xf16> - %alloc_115 = memref.alloc() : memref<128x64x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%27#0, %alloc_111, %alloc_115) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, 
memref<4x128x28x28xf16>, memref<128x64x3x3xf16> - %alloc_116 = memref.alloc() : memref<4x128x28x28xf16> - %alloc_117 = memref.alloc() : memref<128xf32> - %alloc_118 = memref.alloc() : memref<128xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_11, %arg38, %54, %alloc_116, %alloc_117, %alloc_118) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %alloc_119 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_116, %6, %alloc_119) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x64x1x1xf16>, memref<4x64x56x56xf16> - %alloc_120 = memref.alloc() : memref<128x64x1x1xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%27#0, %alloc_116, %alloc_120) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x128x28x28xf16>, memref<128x64x1x1xf16> - %56 = call @Unknown121(%alloc_119, %alloc_114, %27#1) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> - %alloc_121 = memref.alloc() : memref<4x64x56x56xf16> - %alloc_122 = memref.alloc() : memref<64xf32> + %alloc_114 = memref.alloc() : memref<128xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_10, %arg38, %60, %alloc_112, %alloc_113, %alloc_114) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> + %alloc_115 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_112, %6, %alloc_115) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x64x1x1xf16>, memref<4x64x56x56xf16> + %alloc_116 = memref.alloc() : memref<128x64x1x1xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%29#0, %alloc_112, %alloc_116) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x128x28x28xf16>, memref<128x64x1x1xf16> + %62 = call @Unknown127(%alloc_115, %alloc_110, %29#1) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> + %alloc_117 = memref.alloc() : memref<4x64x56x56xf16> + %alloc_118 = memref.alloc() : memref<64xf32> + %alloc_119 = memref.alloc() : memref<64xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_8, %arg23, %62, %alloc_117, %alloc_118, %alloc_119) {epsilon = 9.99999974E-6 : f32, 
feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> + %alloc_120 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_117, %5, %alloc_120) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + %alloc_121 = memref.alloc() : memref<64x64x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%28#0, %alloc_117, %alloc_121) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> + %63 = call @Unknown131(%28#1, %alloc_120) : (memref<4x64x56x56xi1>, memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> + %alloc_122 = memref.alloc() : memref<4x64x56x56xf16> %alloc_123 = memref.alloc() : memref<64xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_9, %arg23, %56, %alloc_121, %alloc_122, %alloc_123) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> - %alloc_124 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_121, %5, %alloc_124) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> - %alloc_125 = memref.alloc() : memref<64x64x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%26#0, %alloc_121, %alloc_125) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> - %57 = call @Unknown125(%26#1, %alloc_124) : (memref<4x64x56x56xi1>, memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> - %alloc_126 = memref.alloc() : memref<4x64x56x56xf16> - %alloc_127 = memref.alloc() : memref<64xf32> + %alloc_124 = memref.alloc() : memref<64xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_6, %arg18, %63, %alloc_122, %alloc_123, %alloc_124) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> + %alloc_125 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_122, %4, %alloc_125) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], 
output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + %alloc_126 = memref.alloc() : memref<64x64x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%27#0, %alloc_122, %alloc_126) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> + %64 = call @Unknown127(%62, %alloc_125, %27#1) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> + %alloc_127 = memref.alloc() : memref<4x64x56x56xf16> %alloc_128 = memref.alloc() : memref<64xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_7, %arg18, %57, %alloc_126, %alloc_127, %alloc_128) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> - %alloc_129 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_126, %4, %alloc_129) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> - %alloc_130 = memref.alloc() : memref<64x64x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%25#0, %alloc_126, %alloc_130) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> - %58 = call @Unknown129(%56, %alloc_129, %25#1) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> - %alloc_131 = memref.alloc() : memref<4x64x56x56xf16> - %alloc_132 = memref.alloc() : memref<64xf32> + %alloc_129 = memref.alloc() : memref<64xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_4, %arg13, %64, %alloc_127, %alloc_128, %alloc_129) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> + %alloc_130 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_127, %3, %alloc_130) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + %alloc_131 = memref.alloc() : memref<64x64x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%26#0, %alloc_127, %alloc_131) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 
1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> + %65 = call @Unknown131(%26#1, %alloc_130) : (memref<4x64x56x56xi1>, memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> + %alloc_132 = memref.alloc() : memref<4x64x56x56xf16> %alloc_133 = memref.alloc() : memref<64xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_5, %arg13, %58, %alloc_131, %alloc_132, %alloc_133) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> - %alloc_134 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_131, %3, %alloc_134) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> - %alloc_135 = memref.alloc() : memref<64x64x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%24#0, %alloc_131, %alloc_135) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> - %59 = call @Unknown133(%24#1, %alloc_134) : (memref<4x64x56x56xi1>, memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> - %alloc_136 = memref.alloc() : memref<4x64x56x56xf16> - %alloc_137 = memref.alloc() : memref<64xf32> - %alloc_138 = memref.alloc() : memref<64xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_3, %arg8, %59, %alloc_136, %alloc_137, %alloc_138) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> - %alloc_139 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_136, %2, %alloc_139) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> - %alloc_140 = memref.alloc() : memref<64x64x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%alloc_2, %alloc_136, %alloc_140) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> - %60 = call @Unknown137(%58, %alloc_139) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> - %alloc_141 = memref.alloc() : memref<4x64x112x112xf16> - byre.compute @PoolMaxGradOp_f16f16_f16(%23#0, %60, %alloc_141) {memory_effects = [1 : i32, 1 : i32, 2 : i32], padding = 
dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16>, memref<4x64x56x56xf16>, memref<4x64x112x112xf16> - %61 = call @Unknown138(%23#1, %alloc_141) : (memref<4x64x112x112xi1>, memref<4x64x112x112xf16>) -> memref<4x64x112x112xf16> - %alloc_142 = memref.alloc() : memref<4x64x112x112xf16> - %alloc_143 = memref.alloc() : memref<64xf32> - %alloc_144 = memref.alloc() : memref<64xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc, %arg3, %61, %alloc_142, %alloc_143, %alloc_144) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x112x112xf16>, memref<64xf32>, memref<4x64x112x112xf16>, memref<4x64x112x112xf16>, memref<64xf32>, memref<64xf32> - %alloc_145 = memref.alloc() : memref<64x3x7x7xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%0, %alloc_142, %alloc_145) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x3x224x224xf16>, memref<4x64x112x112xf16>, memref<64x3x7x7xf16> - %alloc_146 = memref.alloc() : memref - byre.compute @ReduceSumOp_f32_f32(%43#1, %alloc_146) {dimensions = dense<[0, 1]> : tensor<2xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf32>, memref - %62 = call @Unknown141(%alloc_146) : (memref) -> memref - %63 = call @Unknown142(%alloc_145) : (memref<64x3x7x7xf16>) -> memref<64x3x7x7xf32> - %64 = call @Unknown143(%alloc_140) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %65 = call @Unknown144(%alloc_135) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %66 = call @Unknown145(%alloc_130) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %67 = call @Unknown146(%alloc_125) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %68 = call @Unknown147(%alloc_115) : (memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> - %69 = call @Unknown148(%alloc_110) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> - %70 = call @Unknown149(%alloc_120) : (memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> - %71 = call @Unknown150(%alloc_105) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> - %72 = call @Unknown151(%alloc_100) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> - %73 = call @Unknown152(%alloc_90) : (memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> - %74 = call @Unknown153(%alloc_85) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> - %75 = call @Unknown154(%alloc_95) : (memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> - %76 = call @Unknown155(%alloc_80) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> - %77 = call @Unknown156(%alloc_75) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> - %78 = call @Unknown157(%alloc_65) : (memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> - %79 = call @Unknown158(%alloc_60) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> - %80 = call @Unknown159(%alloc_70) : (memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> - %81 = call @Unknown160(%alloc_55) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> - %82 = call @Unknown161(%alloc_50) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> - %alloc_147 = memref.alloc() : memref<1000x512xf16> - byre.compute @MatmulOp_f16f16_f16(%40, %43#0, %alloc_147) 
{lhs_contracting_dimension = 0 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_transpose, rhs_contracting_dimension = 0 : i64} : memref<4x512xf16>, memref<4x1000xf16>, memref<1000x512xf16> - %83 = call @Unknown163(%alloc_147) : (memref<1000x512xf16>) -> memref<1000x512xf32> - %alloc_148 = memref.alloc() : memref<1000xf32> - byre.compute @ReduceSumOp_f32_f32(%43#2, %alloc_148) {dimensions = dense<0> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf32>, memref<1000xf32> - %84 = call @Unknown164(%alloc_148) : (memref<1000xf32>) -> memref<1000xf32> - return %62, %63, %alloc_143, %alloc_144, %64, %alloc_137, %alloc_138, %65, %alloc_132, %alloc_133, %66, %alloc_127, %alloc_128, %67, %alloc_122, %alloc_123, %68, %alloc_112, %alloc_113, %69, %alloc_107, %alloc_108, %70, %alloc_117, %alloc_118, %71, %alloc_102, %alloc_103, %72, %alloc_97, %alloc_98, %73, %alloc_87, %alloc_88, %74, %alloc_82, %alloc_83, %75, %alloc_92, %alloc_93, %76, %alloc_77, %alloc_78, %77, %alloc_72, %alloc_73, %78, %alloc_62, %alloc_63, %79, %alloc_57, %alloc_58, %80, %alloc_67, %alloc_68, %81, %alloc_52, %alloc_53, %82, %alloc_47, %alloc_48, %83, %84 : memref, memref<64x3x7x7xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<128x64x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x64x1x1xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<256x128x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x128x1x1xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<512x256x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x256x1x1xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<1000x512xf32>, memref<1000xf32> + %alloc_134 = memref.alloc() : memref<64xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_2, %arg8, %65, %alloc_132, %alloc_133, %alloc_134) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> + %alloc_135 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_132, %2, %alloc_135) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + %alloc_136 = memref.alloc() : memref<64x64x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%alloc_1, %alloc_132, %alloc_136) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", 
padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> + %66 = call @Unknown143(%64, %alloc_135) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> + %alloc_137 = memref.alloc() : memref<4x64x112x112xf16> + byre.compute @PoolMaxGradOp_f16f16_f16(%25#0, %66, %alloc_137) {memory_effects = [1 : i32, 1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16>, memref<4x64x56x56xf16>, memref<4x64x112x112xf16> + %67 = call @Unknown144(%25#1, %alloc_137) : (memref<4x64x112x112xi1>, memref<4x64x112x112xf16>) -> memref<4x64x112x112xf16> + %alloc_138 = memref.alloc() : memref<4x64x112x112xf16> + %alloc_139 = memref.alloc() : memref<64xf32> + %alloc_140 = memref.alloc() : memref<64xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc, %arg3, %67, %alloc_138, %alloc_139, %alloc_140) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x112x112xf16>, memref<64xf32>, memref<4x64x112x112xf16>, memref<4x64x112x112xf16>, memref<64xf32>, memref<64xf32> + %alloc_141 = memref.alloc() : memref<64x3x7x7xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%0, %alloc_138, %alloc_141) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x3x224x224xf16>, memref<4x64x112x112xf16>, memref<64x3x7x7xf16> + %68 = call @Unknown147(%49#0, %arg1) : (memref<4x1000xf16>, memref<4x1000xf32>) -> memref + %69 = call @Unknown148(%68) : (memref) -> memref + %70 = call @Unknown149(%alloc_141) : (memref<64x3x7x7xf16>) -> memref<64x3x7x7xf32> + %71 = call @Unknown150(%alloc_136) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %72 = call @Unknown150(%alloc_131) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %73 = call @Unknown150(%alloc_126) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %74 = call @Unknown150(%alloc_121) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %75 = call @Unknown154(%alloc_111) : (memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> + %76 = call @Unknown155(%alloc_106) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> + %77 = call @Unknown156(%alloc_116) : (memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> + %78 = call @Unknown155(%alloc_101) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> + %79 = call @Unknown155(%alloc_96) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> + %80 = call @Unknown159(%alloc_86) : (memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> + %81 = call @Unknown160(%alloc_81) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> + %82 = call @Unknown161(%alloc_91) : (memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> + %83 = call @Unknown160(%alloc_76) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> + %84 = call @Unknown160(%alloc_71) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> + %85 = call @Unknown164(%alloc_61) : (memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> + %86 = call @Unknown165(%alloc_56) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> + %87 = call @Unknown166(%alloc_66) : (memref<512x256x1x1xf16>) -> 
memref<512x256x1x1xf32> + %88 = call @Unknown165(%alloc_51) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> + %89 = call @Unknown165(%alloc_46) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> + %alloc_142 = memref.alloc() : memref<1000x512xf16> + byre.compute @MatmulOp_f16f16_f16(%43, %49#1, %alloc_142) {lhs_contracting_dimension = 0 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_transpose, rhs_contracting_dimension = 0 : i64} : memref<4x512xf16>, memref<4x1000xf16>, memref<1000x512xf16> + %90 = call @Unknown170(%alloc_142) : (memref<1000x512xf16>) -> memref<1000x512xf32> + %91 = call @Unknown171(%49#1) : (memref<4x1000xf16>) -> memref<1000xf32> + %92 = call @Unknown172(%91) : (memref<1000xf32>) -> memref<1000xf32> + return %69, %70, %alloc_139, %alloc_140, %71, %alloc_133, %alloc_134, %72, %alloc_128, %alloc_129, %73, %alloc_123, %alloc_124, %74, %alloc_118, %alloc_119, %75, %alloc_108, %alloc_109, %76, %alloc_103, %alloc_104, %77, %alloc_113, %alloc_114, %78, %alloc_98, %alloc_99, %79, %alloc_93, %alloc_94, %80, %alloc_83, %alloc_84, %81, %alloc_78, %alloc_79, %82, %alloc_88, %alloc_89, %83, %alloc_73, %alloc_74, %84, %alloc_68, %alloc_69, %85, %alloc_58, %alloc_59, %86, %alloc_53, %alloc_54, %87, %alloc_63, %alloc_64, %88, %alloc_48, %alloc_49, %89, %alloc_43, %alloc_44, %90, %92 : memref, memref<64x3x7x7xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<128x64x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x64x1x1xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<256x128x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x128x1x1xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<512x256x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x256x1x1xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<1000x512xf32>, memref<1000xf32> } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/Whole/5_alternative_scf_opt.mlir b/compiler/test/E2E/ResNet18/Whole/5_alternative_scf_opt.mlir index 53e23b190..a0b85713c 100644 --- a/compiler/test/E2E/ResNet18/Whole/5_alternative_scf_opt.mlir +++ b/compiler/test/E2E/ResNet18/Whole/5_alternative_scf_opt.mlir @@ -2,676 +2,1631 @@ // CHECK-LABEL: func.func @main -#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> -#map1 = affine_map<(d0, d1) -> (d0, d1)> -#map2 = affine_map<(d0, d1) -> (d1)> -#map3 = affine_map<(d0, d1) -> (d0)> -#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> -#map5 = affine_map<() -> ()> -#map6 = affine_map<(d0) -> (d0)> +#map = affine_map<() -> ()> +#map1 = affine_map<(d0) -> (d0 * 2 - (d0 floordiv 512) * 1024, 1000)> +#map2 = affine_map<(d0) -> (d0 * 2 - (d0 floordiv 512) * 1024 + 2, 1000)> +#map3 = affine_map<(d0, d1) -> (d0 - d1)> +#map4 = affine_map<(d0) -> (d0 * 2)> +#map5 = affine_map<(d0) -> (d0 * 2 + 1)> +#map6 = affine_map<(d0) -> (d0 mod 64, 49)> 
+#map7 = affine_map<(d0) -> (d0 mod 64 + 1, 49)> +#map8 = affine_map<(d0) -> (d0 mod 128, 125)> +#map9 = affine_map<(d0) -> (d0 mod 128 + 1, 125)> +#map10 = affine_map<(d0)[s0] -> (d0 * 32 + s0)> +#map11 = affine_map<(d0) -> (d0 * -32 + 1000, 32)> +#map12 = affine_map<(d0) -> (d0 * 32)> +#map13 = affine_map<(d0, d1) -> (d1 * -32 + 1000, 32, d0)> +#map14 = affine_map<(d0, d1) -> (d1 * -32 + 1000, 32, d0 + 1)> module @IrToMhlo.2452 { func.func private @Unknown0(%arg0: memref<4x3x224x224xf32>) -> memref<4x3x224x224xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + %c224 = arith.constant 224 : index %alloc = memref.alloc() : memref<4x3x224x224xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<4x3x224x224xf32>) outs(%alloc : memref<4x3x224x224xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c3 step %c1 { + scf.for %arg3 = %c0 to %c224 step %c1 { + scf.for %arg4 = %c0 to %c224 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x3x224x224xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x3x224x224xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<4x3x224x224xf16> } func.func private @Unknown1(%arg0: memref<64x3x7x7xf32>) -> memref<64x3x7x7xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + %c7 = arith.constant 7 : index %alloc = memref.alloc() : memref<64x3x7x7xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x3x7x7xf32>) outs(%alloc : memref<64x3x7x7xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c64 step %c1 { + scf.for %arg2 = %c0 to %c3 step %c1 { + scf.for %arg3 = %c0 to %c7 step %c1 { + scf.for %arg4 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x3x7x7xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x3x7x7xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<64x3x7x7xf16> } func.func private @Unknown3(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<64x64x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf32>) outs(%alloc : memref<64x64x3x3xf16>) { - 
^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<64x64x3x3xf16> - } - func.func private @Unknown4(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<64x64x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf32>) outs(%alloc : memref<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<64x64x3x3xf16> - } - func.func private @Unknown5(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<64x64x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf32>) outs(%alloc : memref<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<64x64x3x3xf16> - } - func.func private @Unknown6(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<64x64x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf32>) outs(%alloc : memref<64x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c64 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x64x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x64x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<64x64x3x3xf16> } func.func private @Unknown7(%arg0: memref<128x64x1x1xf32>) -> memref<128x64x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index %alloc = memref.alloc() : memref<128x64x1x1xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x64x1x1xf32>) outs(%alloc : memref<128x64x1x1xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x1x1xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x1x1xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } } return %alloc : memref<128x64x1x1xf16> } func.func private @Unknown8(%arg0: memref<128x64x3x3xf32>) -> memref<128x64x3x3xf16> 
attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<128x64x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x64x3x3xf32>) outs(%alloc : memref<128x64x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<128x64x3x3xf16> } func.func private @Unknown9(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<128x128x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x128x3x3xf32>) outs(%alloc : memref<128x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<128x128x3x3xf16> - } - func.func private @Unknown10(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<128x128x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x128x3x3xf32>) outs(%alloc : memref<128x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<128x128x3x3xf16> - } - func.func private @Unknown11(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<128x128x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x128x3x3xf32>) outs(%alloc : memref<128x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c128 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x128x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x128x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : 
memref<128x128x3x3xf16> } func.func private @Unknown12(%arg0: memref<256x128x1x1xf32>) -> memref<256x128x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index %alloc = memref.alloc() : memref<256x128x1x1xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x128x1x1xf32>) outs(%alloc : memref<256x128x1x1xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c128 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x1x1xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x1x1xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } } return %alloc : memref<256x128x1x1xf16> } func.func private @Unknown13(%arg0: memref<256x128x3x3xf32>) -> memref<256x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<256x128x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x128x3x3xf32>) outs(%alloc : memref<256x128x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c128 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<256x128x3x3xf16> } func.func private @Unknown14(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<256x256x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x256x3x3xf32>) outs(%alloc : memref<256x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<256x256x3x3xf16> - } - func.func private @Unknown15(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<256x256x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x256x3x3xf32>) outs(%alloc : 
memref<256x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<256x256x3x3xf16> - } - func.func private @Unknown16(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<256x256x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x256x3x3xf32>) outs(%alloc : memref<256x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x256x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x256x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<256x256x3x3xf16> } func.func private @Unknown17(%arg0: memref<512x256x1x1xf32>) -> memref<512x256x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<512x256x1x1xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x256x1x1xf32>) outs(%alloc : memref<512x256x1x1xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x1x1xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x1x1xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } } return %alloc : memref<512x256x1x1xf16> } func.func private @Unknown18(%arg0: memref<512x256x3x3xf32>) -> memref<512x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<512x256x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x256x3x3xf32>) outs(%alloc : memref<512x256x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : 
memref<512x256x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<512x256x3x3xf16> } func.func private @Unknown19(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<512x512x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x512x3x3xf32>) outs(%alloc : memref<512x512x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<512x512x3x3xf16> - } - func.func private @Unknown20(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<512x512x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x512x3x3xf32>) outs(%alloc : memref<512x512x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 - } - return %alloc : memref<512x512x3x3xf16> - } - func.func private @Unknown21(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<512x512x3x3xf16> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x512x3x3xf32>) outs(%alloc : memref<512x512x3x3xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c512 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x512x3x3xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x512x3x3xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<512x512x3x3xf16> } func.func private @Unknown22(%arg0: memref<4x1000xf32>) -> memref<4x1000xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant -2.500000e-01 : f32 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index %alloc = memref.alloc() : memref<4x1000xf16> - linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg0 : memref<4x1000xf32>) outs(%alloc : memref<4x1000xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.mulf %in, %cst : f32 - %1 = arith.truncf %0 : f32 to f16 - linalg.yield %1 : f16 + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c1000 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2] [1, 1] [1, 1] : memref<4x1000xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2] [1, 1] [1, 1] : memref<4x1000xf16> to memref> + 
linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.mulf %in, %cst : f32 + %1 = arith.truncf %0 : f32 to f16 + linalg.yield %1 : f16 + } + } } return %alloc : memref<4x1000xf16> } func.func private @Unknown23(%arg0: memref<1000x512xf32>) -> memref<1000x512xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index %alloc = memref.alloc() : memref<1000x512xf16> - linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg0 : memref<1000x512xf32>) outs(%alloc : memref<1000x512xf16>) { - ^bb0(%in: f32, %out: f16): - %0 = arith.truncf %in : f32 to f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c1000 step %c1 { + scf.for %arg2 = %c0 to %c512 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2] [1, 1] [1, 1] : memref<1000x512xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2] [1, 1] [1, 1] : memref<1000x512xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } } return %alloc : memref<1000x512xf16> } - func.func private @Unknown24(%arg0: memref<4x64x112x112xf16>) -> (memref<4x64x112x112xf16>, memref<4x64x112x112xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown24(%arg0: memref<1000xf32>) -> memref<1000xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index + %alloc = memref.alloc() : memref<1000xf16> + scf.for %arg1 = %c0 to %c1000 step %c1 { + %subview = memref.subview %arg0[%arg1] [1] [1] : memref<1000xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1] [1] [1] : memref<1000xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f16): + %0 = arith.truncf %in : f32 to f16 + linalg.yield %0 : f16 + } + } + return %alloc : memref<1000xf16> + } + func.func private @Unknown25(%arg0: memref<4x1000xf16>) -> memref<4xf16> attributes {__byteir_reduction_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %alloc = memref.alloc() : memref<4xf16> + scf.forall (%arg1) in (4) { + %subview = memref.subview %arg0[%arg1, 0] [1, 1000] [1, 1] : memref<4x1000xf16> to memref<1000xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<1000xf16, strided<[1], offset: ?>> into memref<1x1000xf16, strided<[1000, 1], offset: ?>> + %alloca = memref.alloca() : memref<512xf16, #gpu.address_space> + scf.forall (%arg2) in (512) { + %0 = affine.min #map1(%arg2) + %1 = affine.min #map2(%arg2) + %2 = affine.apply #map3(%1, %0) + %subview_8 = memref.subview %expand_shape[0, %0] [1, %2] [1, 1] : memref<1x1000xf16, strided<[1000, 1], offset: ?>> to memref> + %expand_shape_9 = memref.expand_shape %subview_8 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %3 = arith.cmpi ugt, %2, %c0 : index + %4 = scf.if %3 -> (f16) { + %9 = memref.load %expand_shape_9[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %9 : f16 + } else { + scf.yield %cst : f16 + } + %5 = 
arith.addf %4, %cst : f16 + %6 = arith.cmpi ugt, %2, %c1 : index + %7 = scf.if %6 -> (f16) { + %9 = memref.load %expand_shape_9[%c0, %c1] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %9 : f16 + } else { + scf.yield %cst : f16 + } + %8 = arith.addf %5, %7 : f16 + memref.store %8, %alloca[%arg2] : memref<512xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_0 = memref.alloca() : memref<256xf16, #gpu.address_space> + scf.forall (%arg2) in (256) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca[%0] : memref<512xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca[%3] : memref<512xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_0[%arg2] : memref<256xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_1 = memref.alloca() : memref<128xf16, #gpu.address_space> + scf.forall (%arg2) in (128) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_0[%0] : memref<256xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_0[%3] : memref<256xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_1[%arg2] : memref<128xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_2 = memref.alloca() : memref<64xf16, #gpu.address_space> + scf.forall (%arg2) in (64) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_1[%0] : memref<128xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_1[%3] : memref<128xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_2[%arg2] : memref<64xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_3 = memref.alloca() : memref<32xf16, #gpu.address_space> + scf.forall (%arg2) in (32) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_2[%0] : memref<64xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_2[%3] : memref<64xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_3[%arg2] : memref<32xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_4 = memref.alloca() : memref<16xf16, #gpu.address_space> + scf.forall (%arg2) in (16) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_3[%0] : memref<32xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_3[%3] : memref<32xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_4[%arg2] : memref<16xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_5 = memref.alloca() : memref<8xf16, #gpu.address_space> + scf.forall (%arg2) in (8) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_4[%0] : memref<16xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_4[%3] : memref<16xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_5[%arg2] : memref<8xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_6 = memref.alloca() : memref<4xf16, #gpu.address_space> + scf.forall (%arg2) in (4) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_5[%0] : memref<8xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_5[%3] : memref<8xf16, #gpu.address_space> + %5 = 
arith.addf %4, %2 : f16 + memref.store %5, %alloca_6[%arg2] : memref<4xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_7 = memref.alloca() : memref<2xf16, #gpu.address_space> + scf.forall (%arg2) in (2) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_6[%0] : memref<4xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_6[%3] : memref<4xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_7[%arg2] : memref<2xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + scf.forall (%arg2) in (1) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_7[%0] : memref<2xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_7[%3] : memref<2xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloc[%arg1] : memref<4xf16> + } {mapping = [#gpu.thread]} + } {mapping = [#gpu.block]} + return %alloc : memref<4xf16> + } + func.func private @Unknown26(%arg0: memref<4x64x112x112xf16>) -> (memref<4x64x112x112xf16>, memref<4x64x112x112xi1>) attributes {__byteir_elementwise_fusion__} { + %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c112 = arith.constant 112 : index %alloc = memref.alloc() : memref<4x64x112x112xf16> %alloc_0 = memref.alloc() : memref<4x64x112x112xi1> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<4x64x112x112xf16>) outs(%alloc, %alloc_0 : memref<4x64x112x112xf16>, memref<4x64x112x112xi1>) { - ^bb0(%in: f16, %out: f16, %out_1: i1): - %0 = arith.maxnumf %in, %cst : f16 - %1 = arith.cmpf ogt, %0, %cst : f16 - linalg.yield %0, %1 : f16, i1 + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c112 step %c1 { + scf.for %arg4 = %c0 to %c112 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x112x112xf16> to memref> + %subview_1 = memref.subview %alloc_0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x112x112xi1> to memref> + %subview_2 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x112x112xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_2, %subview_1 : memref>, memref>) { + ^bb0(%in: f16, %out: f16, %out_3: i1): + %0 = arith.maximumf %in, %cst : f16 + %1 = arith.cmpf ogt, %0, %cst : f16 + linalg.yield %0, %1 : f16, i1 + } + } + } + } } return %alloc, %alloc_0 : memref<4x64x112x112xf16>, memref<4x64x112x112xi1> } - func.func private @Unknown26(%arg0: memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x64x56x56xf16> - %alloc_0 = memref.alloc() : memref<4x64x56x56xi1> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<4x64x56x56xf16>) outs(%alloc, %alloc_0 : memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) { - ^bb0(%in: f16, %out: f16, %out_1: i1): - %0 = arith.maxnumf %in, %cst : f16 - %1 = arith.cmpf ogt, %0, %cst : f16 - linalg.yield %0, %1 : f16, i1 - } - return %alloc, %alloc_0 
: memref<4x64x56x56xf16>, memref<4x64x56x56xi1> - } - func.func private @Unknown28(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown28(%arg0: memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %alloc = memref.alloc() : memref<4x64x56x56xf16> %alloc_0 = memref.alloc() : memref<4x64x56x56xi1> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) outs(%alloc, %alloc_0 : memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) { - ^bb0(%in: f16, %in_1: f16, %out: f16, %out_2: i1): - %0 = arith.addf %in, %in_1 : f16 - %1 = arith.maxnumf %0, %cst : f16 - %2 = arith.cmpf ogt, %1, %cst : f16 - linalg.yield %1, %2 : f16, i1 + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c56 step %c1 { + scf.for %arg4 = %c0 to %c56 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xf16> to memref> + %subview_1 = memref.subview %alloc_0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xi1> to memref> + %subview_2 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_2, %subview_1 : memref>, memref>) { + ^bb0(%in: f16, %out: f16, %out_3: i1): + %0 = arith.maximumf %in, %cst : f16 + %1 = arith.cmpf ogt, %0, %cst : f16 + linalg.yield %0, %1 : f16, i1 + } + } + } + } } return %alloc, %alloc_0 : memref<4x64x56x56xf16>, memref<4x64x56x56xi1> } - func.func private @Unknown30(%arg0: memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown30(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %alloc = memref.alloc() : memref<4x64x56x56xf16> %alloc_0 = memref.alloc() : memref<4x64x56x56xi1> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<4x64x56x56xf16>) outs(%alloc, %alloc_0 : memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) { - ^bb0(%in: f16, %out: f16, %out_1: i1): - %0 = arith.maxnumf %in, %cst : f16 - %1 = arith.cmpf ogt, %0, %cst : f16 - linalg.yield %0, %1 : f16, i1 + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c64 step %c1 { + scf.for %arg4 = %c0 to %c56 step %c1 { + scf.for %arg5 = %c0 to %c56 step %c1 { + %subview = memref.subview %arg0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xf16> to memref> + %subview_1 = memref.subview %alloc_0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xi1> to memref> + 
%subview_2 = memref.subview %alloc[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xf16> to memref> + %subview_3 = memref.subview %arg1[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_3 : memref>, memref>) outs(%subview_2, %subview_1 : memref>, memref>) { + ^bb0(%in: f16, %in_4: f16, %out: f16, %out_5: i1): + %0 = arith.addf %in, %in_4 : f16 + %1 = arith.maximumf %0, %cst : f16 + %2 = arith.cmpf ogt, %1, %cst : f16 + linalg.yield %1, %2 : f16, i1 + } + } + } + } } return %alloc, %alloc_0 : memref<4x64x56x56xf16>, memref<4x64x56x56xi1> } - func.func private @Unknown32(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x64x56x56xf16> - %alloc_0 = memref.alloc() : memref<4x64x56x56xi1> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) outs(%alloc, %alloc_0 : memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) { - ^bb0(%in: f16, %in_1: f16, %out: f16, %out_2: i1): - %0 = arith.addf %in, %in_1 : f16 - %1 = arith.maxnumf %0, %cst : f16 - %2 = arith.cmpf ogt, %1, %cst : f16 - linalg.yield %1, %2 : f16, i1 - } - return %alloc, %alloc_0 : memref<4x64x56x56xf16>, memref<4x64x56x56xi1> - } - func.func private @Unknown35(%arg0: memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x128x28x28xf16> - %alloc_0 = memref.alloc() : memref<4x128x28x28xi1> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<4x128x28x28xf16>) outs(%alloc, %alloc_0 : memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) { - ^bb0(%in: f16, %out: f16, %out_1: i1): - %0 = arith.maxnumf %in, %cst : f16 - %1 = arith.cmpf ogt, %0, %cst : f16 - linalg.yield %0, %1 : f16, i1 - } - return %alloc, %alloc_0 : memref<4x128x28x28xf16>, memref<4x128x28x28xi1> - } - func.func private @Unknown37(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown37(%arg0: memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %alloc = memref.alloc() : memref<4x128x28x28xf16> %alloc_0 = memref.alloc() : memref<4x128x28x28xi1> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>) outs(%alloc, %alloc_0 : memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) { - ^bb0(%in: f16, %in_1: f16, %out: f16, %out_2: i1): - %0 = arith.addf %in, %in_1 : f16 - %1 = arith.maxnumf %0, %cst : f16 - %2 = arith.cmpf ogt, %1, %cst : f16 - linalg.yield %1, %2 : f16, i1 + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 
= %c0 to %c128 step %c1 { + scf.for %arg3 = %c0 to %c28 step %c1 { + scf.for %arg4 = %c0 to %c28 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xf16> to memref> + %subview_1 = memref.subview %alloc_0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xi1> to memref> + %subview_2 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_2, %subview_1 : memref>, memref>) { + ^bb0(%in: f16, %out: f16, %out_3: i1): + %0 = arith.maximumf %in, %cst : f16 + %1 = arith.cmpf ogt, %0, %cst : f16 + linalg.yield %0, %1 : f16, i1 + } + } + } + } } return %alloc, %alloc_0 : memref<4x128x28x28xf16>, memref<4x128x28x28xi1> } - func.func private @Unknown39(%arg0: memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown39(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %alloc = memref.alloc() : memref<4x128x28x28xf16> %alloc_0 = memref.alloc() : memref<4x128x28x28xi1> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<4x128x28x28xf16>) outs(%alloc, %alloc_0 : memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) { - ^bb0(%in: f16, %out: f16, %out_1: i1): - %0 = arith.maxnumf %in, %cst : f16 - %1 = arith.cmpf ogt, %0, %cst : f16 - linalg.yield %0, %1 : f16, i1 + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c128 step %c1 { + scf.for %arg4 = %c0 to %c28 step %c1 { + scf.for %arg5 = %c0 to %c28 step %c1 { + %subview = memref.subview %arg0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xf16> to memref> + %subview_1 = memref.subview %alloc_0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xi1> to memref> + %subview_2 = memref.subview %alloc[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xf16> to memref> + %subview_3 = memref.subview %arg1[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_3 : memref>, memref>) outs(%subview_2, %subview_1 : memref>, memref>) { + ^bb0(%in: f16, %in_4: f16, %out: f16, %out_5: i1): + %0 = arith.addf %in, %in_4 : f16 + %1 = arith.maximumf %0, %cst : f16 + %2 = arith.cmpf ogt, %1, %cst : f16 + linalg.yield %1, %2 : f16, i1 + } + } + } + } } return %alloc, %alloc_0 : memref<4x128x28x28xf16>, memref<4x128x28x28xi1> } - func.func private @Unknown41(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x128x28x28xf16> - %alloc_0 = memref.alloc() : memref<4x128x28x28xi1> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : 
memref<4x128x28x28xf16>, memref<4x128x28x28xf16>) outs(%alloc, %alloc_0 : memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) { - ^bb0(%in: f16, %in_1: f16, %out: f16, %out_2: i1): - %0 = arith.addf %in, %in_1 : f16 - %1 = arith.maxnumf %0, %cst : f16 - %2 = arith.cmpf ogt, %1, %cst : f16 - linalg.yield %1, %2 : f16, i1 - } - return %alloc, %alloc_0 : memref<4x128x28x28xf16>, memref<4x128x28x28xi1> - } - func.func private @Unknown44(%arg0: memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown46(%arg0: memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %alloc = memref.alloc() : memref<4x256x14x14xf16> %alloc_0 = memref.alloc() : memref<4x256x14x14xi1> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<4x256x14x14xf16>) outs(%alloc, %alloc_0 : memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) { - ^bb0(%in: f16, %out: f16, %out_1: i1): - %0 = arith.maxnumf %in, %cst : f16 - %1 = arith.cmpf ogt, %0, %cst : f16 - linalg.yield %0, %1 : f16, i1 + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + scf.for %arg3 = %c0 to %c14 step %c1 { + scf.for %arg4 = %c0 to %c14 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xf16> to memref> + %subview_1 = memref.subview %alloc_0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xi1> to memref> + %subview_2 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_2, %subview_1 : memref>, memref>) { + ^bb0(%in: f16, %out: f16, %out_3: i1): + %0 = arith.maximumf %in, %cst : f16 + %1 = arith.cmpf ogt, %0, %cst : f16 + linalg.yield %0, %1 : f16, i1 + } + } + } + } } return %alloc, %alloc_0 : memref<4x256x14x14xf16>, memref<4x256x14x14xi1> } - func.func private @Unknown46(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown48(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %alloc = memref.alloc() : memref<4x256x14x14xf16> %alloc_0 = memref.alloc() : memref<4x256x14x14xi1> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>) outs(%alloc, %alloc_0 : memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) { - ^bb0(%in: f16, %in_1: f16, %out: f16, %out_2: i1): - %0 = arith.addf %in, %in_1 : f16 - %1 = arith.maxnumf %0, %cst : f16 - %2 = arith.cmpf ogt, %1, %cst : f16 - linalg.yield %1, %2 : f16, i1 + scf.for %arg2 = %c0 to %c4 step 
%c1 { + scf.for %arg3 = %c0 to %c256 step %c1 { + scf.for %arg4 = %c0 to %c14 step %c1 { + scf.for %arg5 = %c0 to %c14 step %c1 { + %subview = memref.subview %arg0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xf16> to memref> + %subview_1 = memref.subview %alloc_0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xi1> to memref> + %subview_2 = memref.subview %alloc[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xf16> to memref> + %subview_3 = memref.subview %arg1[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_3 : memref>, memref>) outs(%subview_2, %subview_1 : memref>, memref>) { + ^bb0(%in: f16, %in_4: f16, %out: f16, %out_5: i1): + %0 = arith.addf %in, %in_4 : f16 + %1 = arith.maximumf %0, %cst : f16 + %2 = arith.cmpf ogt, %1, %cst : f16 + linalg.yield %1, %2 : f16, i1 + } + } + } + } } return %alloc, %alloc_0 : memref<4x256x14x14xf16>, memref<4x256x14x14xi1> } - func.func private @Unknown48(%arg0: memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x256x14x14xf16> - %alloc_0 = memref.alloc() : memref<4x256x14x14xi1> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<4x256x14x14xf16>) outs(%alloc, %alloc_0 : memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) { - ^bb0(%in: f16, %out: f16, %out_1: i1): - %0 = arith.maxnumf %in, %cst : f16 - %1 = arith.cmpf ogt, %0, %cst : f16 - linalg.yield %0, %1 : f16, i1 - } - return %alloc, %alloc_0 : memref<4x256x14x14xf16>, memref<4x256x14x14xi1> - } - func.func private @Unknown50(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x256x14x14xf16> - %alloc_0 = memref.alloc() : memref<4x256x14x14xi1> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>) outs(%alloc, %alloc_0 : memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) { - ^bb0(%in: f16, %in_1: f16, %out: f16, %out_2: i1): - %0 = arith.addf %in, %in_1 : f16 - %1 = arith.maxnumf %0, %cst : f16 - %2 = arith.cmpf ogt, %1, %cst : f16 - linalg.yield %1, %2 : f16, i1 - } - return %alloc, %alloc_0 : memref<4x256x14x14xf16>, memref<4x256x14x14xi1> - } - func.func private @Unknown53(%arg0: memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown55(%arg0: memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %alloc = memref.alloc() : memref<4x512x7x7xf16> %alloc_0 = memref.alloc() : memref<4x512x7x7xi1> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<4x512x7x7xf16>) outs(%alloc, 
%alloc_0 : memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) { - ^bb0(%in: f16, %out: f16, %out_1: i1): - %0 = arith.maxnumf %in, %cst : f16 - %1 = arith.cmpf ogt, %0, %cst : f16 - linalg.yield %0, %1 : f16, i1 + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c512 step %c1 { + scf.for %arg3 = %c0 to %c7 step %c1 { + scf.for %arg4 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xf16> to memref> + %subview_1 = memref.subview %alloc_0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xi1> to memref> + %subview_2 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_2, %subview_1 : memref>, memref>) { + ^bb0(%in: f16, %out: f16, %out_3: i1): + %0 = arith.maximumf %in, %cst : f16 + %1 = arith.cmpf ogt, %0, %cst : f16 + linalg.yield %0, %1 : f16, i1 + } + } + } + } } return %alloc, %alloc_0 : memref<4x512x7x7xf16>, memref<4x512x7x7xi1> } - func.func private @Unknown55(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown57(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %alloc = memref.alloc() : memref<4x512x7x7xf16> %alloc_0 = memref.alloc() : memref<4x512x7x7xi1> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>) outs(%alloc, %alloc_0 : memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) { - ^bb0(%in: f16, %in_1: f16, %out: f16, %out_2: i1): - %0 = arith.addf %in, %in_1 : f16 - %1 = arith.maxnumf %0, %cst : f16 - %2 = arith.cmpf ogt, %1, %cst : f16 - linalg.yield %1, %2 : f16, i1 + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c512 step %c1 { + scf.for %arg4 = %c0 to %c7 step %c1 { + scf.for %arg5 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xf16> to memref> + %subview_1 = memref.subview %alloc_0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xi1> to memref> + %subview_2 = memref.subview %alloc[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xf16> to memref> + %subview_3 = memref.subview %arg1[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_3 : memref>, memref>) outs(%subview_2, %subview_1 : memref>, memref>) { + ^bb0(%in: f16, %in_4: f16, %out: f16, %out_5: i1): + %0 = arith.addf %in, %in_4 : f16 + %1 = arith.maximumf %0, %cst : f16 + %2 = arith.cmpf ogt, %1, %cst : f16 + linalg.yield %1, %2 : f16, i1 + } + } + } + } } return %alloc, %alloc_0 : memref<4x512x7x7xf16>, memref<4x512x7x7xi1> } - func.func private @Unknown57(%arg0: memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { + 
func.func private @Unknown62(%arg0: memref<4x512x7x7xf16>) -> memref<4x512xf16> attributes {__byteir_reduction_fusion__} { %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x512x7x7xf16> - %alloc_0 = memref.alloc() : memref<4x512x7x7xi1> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<4x512x7x7xf16>) outs(%alloc, %alloc_0 : memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) { - ^bb0(%in: f16, %out: f16, %out_1: i1): - %0 = arith.maxnumf %in, %cst : f16 - %1 = arith.cmpf ogt, %0, %cst : f16 - linalg.yield %0, %1 : f16, i1 - } - return %alloc, %alloc_0 : memref<4x512x7x7xf16>, memref<4x512x7x7xi1> - } - func.func private @Unknown59(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x512x7x7xf16> - %alloc_0 = memref.alloc() : memref<4x512x7x7xi1> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>) outs(%alloc, %alloc_0 : memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) { - ^bb0(%in: f16, %in_1: f16, %out: f16, %out_2: i1): - %0 = arith.addf %in, %in_1 : f16 - %1 = arith.maxnumf %0, %cst : f16 - %2 = arith.cmpf ogt, %1, %cst : f16 - linalg.yield %1, %2 : f16, i1 - } - return %alloc, %alloc_0 : memref<4x512x7x7xf16>, memref<4x512x7x7xi1> - } - func.func private @Unknown60(%arg0: memref<4x512xf16>) -> memref<4x512xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %collapse_shape = memref.collapse_shape %arg0 [[0, 1], [2, 3]] : memref<4x512x7x7xf16> into memref<2048x49xf16> + %alloc = memref.alloc() : memref<2048xf16> + scf.forall (%arg1) in (2048) { + %subview = memref.subview %collapse_shape[%arg1, 0] [1, 49] [1, 1] : memref<2048x49xf16> to memref<49xf16, strided<[1], offset: ?>> + %expand_shape_0 = memref.expand_shape %subview [[0, 1]] : memref<49xf16, strided<[1], offset: ?>> into memref<1x49xf16, strided<[49, 1], offset: ?>> + %alloca = memref.alloca() : memref<64xf16, #gpu.address_space> + scf.forall (%arg2) in (64) { + %0 = affine.min #map6(%arg2) + %1 = affine.min #map7(%arg2) + %2 = affine.apply #map3(%1, %0) + %subview_6 = memref.subview %expand_shape_0[0, %0] [1, %2] [1, 1] : memref<1x49xf16, strided<[49, 1], offset: ?>> to memref> + %expand_shape_7 = memref.expand_shape %subview_6 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %3 = arith.cmpi ugt, %2, %c0 : index + %4 = scf.if %3 -> (f16) { + %6 = memref.load %expand_shape_7[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %6 : f16 + } else { + scf.yield %cst : f16 + } + %5 = arith.addf %4, %cst : f16 + memref.store %5, %alloca[%arg2] : memref<64xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_1 = memref.alloca() : memref<32xf16, #gpu.address_space> + scf.forall (%arg2) in (32) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca[%0] : memref<64xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca[%3] : memref<64xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_1[%arg2] : memref<32xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_2 = memref.alloca() : memref<16xf16, #gpu.address_space> + scf.forall 
(%arg2) in (16) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_1[%0] : memref<32xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_1[%3] : memref<32xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_2[%arg2] : memref<16xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_3 = memref.alloca() : memref<8xf16, #gpu.address_space> + scf.forall (%arg2) in (8) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_2[%0] : memref<16xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_2[%3] : memref<16xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_3[%arg2] : memref<8xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_4 = memref.alloca() : memref<4xf16, #gpu.address_space> + scf.forall (%arg2) in (4) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_3[%0] : memref<8xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_3[%3] : memref<8xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_4[%arg2] : memref<4xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_5 = memref.alloca() : memref<2xf16, #gpu.address_space> + scf.forall (%arg2) in (2) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_4[%0] : memref<4xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_4[%3] : memref<4xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_5[%arg2] : memref<2xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + scf.forall (%arg2) in (1) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_5[%0] : memref<2xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_5[%3] : memref<2xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloc[%arg1] : memref<2048xf16> + } {mapping = [#gpu.thread]} + } {mapping = [#gpu.block]} + %expand_shape = memref.expand_shape %alloc [[0, 1]] : memref<2048xf16> into memref<4x512xf16> + return %expand_shape : memref<4x512xf16> + } + func.func private @Unknown63(%arg0: memref<4x512xf16>) -> memref<4x512xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 2.040100e-02 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index %alloc = memref.alloc() : memref<4x512xf16> - linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg0 : memref<4x512xf16>) outs(%alloc : memref<4x512xf16>) { - ^bb0(%in: f16, %out: f16): - %0 = arith.mulf %in, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg1 = %c0 to %c4 step %c1 { + scf.for %arg2 = %c0 to %c512 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2] [1, 1] [1, 1] : memref<4x512xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2] [1, 1] [1, 1] : memref<4x512xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f16): + %0 = arith.mulf %in, %cst : f16 + linalg.yield %0 : f16 + } + } } return %alloc : memref<4x512xf16> } - func.func private @Unknown61(%arg0: memref<1000xf32>, %arg1: memref<4x1000xf16>) -> 
memref<4x1000xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown64(%arg0: memref<1000xf16>, %arg1: memref<4x1000xf16>) -> memref<4x1000xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index %alloc = memref.alloc() : memref<4x1000xf16> - linalg.generic {indexing_maps = [#map1, #map2, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg1, %arg0 : memref<4x1000xf16>, memref<1000xf32>) outs(%alloc : memref<4x1000xf16>) { - ^bb0(%in: f16, %in_0: f32, %out: f16): - %0 = arith.truncf %in_0 : f32 to f16 - %1 = arith.addf %in, %0 : f16 - linalg.yield %1 : f16 + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c1000 step %c1 { + %subview = memref.subview %arg0[%arg3] [1] [1] : memref<1000xf16> to memref> + %subview_0 = memref.subview %alloc[%arg2, %arg3] [1, 1] [1, 1] : memref<4x1000xf16> to memref> + %subview_1 = memref.subview %arg1[%arg2, %arg3] [1, 1] [1, 1] : memref<4x1000xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.addf %in_2, %in : f16 + linalg.yield %0 : f16 + } + } } return %alloc : memref<4x1000xf16> } - func.func private @Unknown62(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>) -> (memref<4x1000xf16>, memref<4x1000xf16>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown65(%arg0: memref<4x1000xf16>) -> memref<4xf16> attributes {__byteir_reduction_fusion__} { + %cst = arith.constant 0.000000e+00 : f16 + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %alloc = memref.alloc() : memref<4xf16> + scf.forall (%arg1) in (4) { + %subview = memref.subview %arg0[%arg1, 0] [1, 1000] [1, 1] : memref<4x1000xf16> to memref<1000xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<1000xf16, strided<[1], offset: ?>> into memref<1x1000xf16, strided<[1000, 1], offset: ?>> + %alloca = memref.alloca() : memref<512xf16, #gpu.address_space> + scf.forall (%arg2) in (512) { + %0 = affine.min #map1(%arg2) + %1 = affine.min #map2(%arg2) + %2 = affine.apply #map3(%1, %0) + %subview_8 = memref.subview %expand_shape[0, %0] [1, %2] [1, 1] : memref<1x1000xf16, strided<[1000, 1], offset: ?>> to memref> + %expand_shape_9 = memref.expand_shape %subview_8 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %3 = arith.cmpi ugt, %2, %c0 : index + %4 = scf.if %3 -> (f16) { + %8 = memref.load %expand_shape_9[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %8 : f16 + } else { + scf.yield %cst : f16 + } + %5 = arith.cmpi ugt, %2, %c1 : index + %6 = scf.if %5 -> (f16) { + %8 = memref.load %expand_shape_9[%c0, %c1] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %8 : f16 + } else { + scf.yield %cst : f16 + } + %7 = arith.maximumf %4, %6 : f16 + memref.store %7, %alloca[%arg2] : memref<512xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_0 = memref.alloca() : memref<256xf16, #gpu.address_space> + scf.forall (%arg2) in (256) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca[%0] : memref<512xf16, #gpu.address_space> + %2 = affine.apply #map5(%arg2) + %3 = memref.load %alloca[%2] : memref<512xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloca_0[%arg2] : memref<256xf16, #gpu.address_space> + 
} {mapping = [#gpu.thread]} + %alloca_1 = memref.alloca() : memref<128xf16, #gpu.address_space> + scf.forall (%arg2) in (128) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_0[%0] : memref<256xf16, #gpu.address_space> + %2 = affine.apply #map5(%arg2) + %3 = memref.load %alloca_0[%2] : memref<256xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloca_1[%arg2] : memref<128xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_2 = memref.alloca() : memref<64xf16, #gpu.address_space> + scf.forall (%arg2) in (64) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_1[%0] : memref<128xf16, #gpu.address_space> + %2 = affine.apply #map5(%arg2) + %3 = memref.load %alloca_1[%2] : memref<128xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloca_2[%arg2] : memref<64xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_3 = memref.alloca() : memref<32xf16, #gpu.address_space> + scf.forall (%arg2) in (32) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_2[%0] : memref<64xf16, #gpu.address_space> + %2 = affine.apply #map5(%arg2) + %3 = memref.load %alloca_2[%2] : memref<64xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloca_3[%arg2] : memref<32xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_4 = memref.alloca() : memref<16xf16, #gpu.address_space> + scf.forall (%arg2) in (16) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_3[%0] : memref<32xf16, #gpu.address_space> + %2 = affine.apply #map5(%arg2) + %3 = memref.load %alloca_3[%2] : memref<32xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloca_4[%arg2] : memref<16xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_5 = memref.alloca() : memref<8xf16, #gpu.address_space> + scf.forall (%arg2) in (8) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_4[%0] : memref<16xf16, #gpu.address_space> + %2 = affine.apply #map5(%arg2) + %3 = memref.load %alloca_4[%2] : memref<16xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloca_5[%arg2] : memref<8xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_6 = memref.alloca() : memref<4xf16, #gpu.address_space> + scf.forall (%arg2) in (4) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_5[%0] : memref<8xf16, #gpu.address_space> + %2 = affine.apply #map5(%arg2) + %3 = memref.load %alloca_5[%2] : memref<8xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloca_6[%arg2] : memref<4xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_7 = memref.alloca() : memref<2xf16, #gpu.address_space> + scf.forall (%arg2) in (2) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_6[%0] : memref<4xf16, #gpu.address_space> + %2 = affine.apply #map5(%arg2) + %3 = memref.load %alloca_6[%2] : memref<4xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloca_7[%arg2] : memref<2xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + scf.forall (%arg2) in (1) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_7[%0] : memref<2xf16, #gpu.address_space> + %2 = affine.apply #map5(%arg2) + %3 = memref.load %alloca_7[%2] : memref<2xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloc[%arg1] : memref<4xf16> + } {mapping = [#gpu.thread]} + } {mapping = [#gpu.block]} + return %alloc : memref<4xf16> + } + func.func private 
@Unknown66(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>) -> memref<4x1000xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index %alloc = memref.alloc() : memref<4x1000xf16> - %alloc_0 = memref.alloc() : memref<4x1000xf16> - linalg.generic {indexing_maps = [#map1, #map3, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg1, %arg0 : memref<4x1000xf16>, memref<4xf16>) outs(%alloc, %alloc_0 : memref<4x1000xf16>, memref<4x1000xf16>) { - ^bb0(%in: f16, %in_1: f16, %out: f16, %out_2: f16): - %0 = arith.subf %in, %in_1 : f16 - %1 = math.exp %0 : f16 - linalg.yield %0, %1 : f16, f16 + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c1000 step %c1 { + %subview = memref.subview %arg0[%arg2] [1] [1] : memref<4xf16> to memref> + %subview_0 = memref.subview %alloc[%arg2, %arg3] [1, 1] [1, 1] : memref<4x1000xf16> to memref> + %subview_1 = memref.subview %arg1[%arg2, %arg3] [1, 1] [1, 1] : memref<4x1000xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.subf %in_2, %in : f16 + linalg.yield %0 : f16 + } + } } - return %alloc, %alloc_0 : memref<4x1000xf16>, memref<4x1000xf16> + return %alloc : memref<4x1000xf16> } - func.func private @Unknown63(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4xf16>, %arg3: memref<4x1000xf16>, %arg4: memref<4x1000xf32>) -> (memref<4x1000xf16>, memref<4x1000xf32>, memref<4x1000xf32>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown67(%arg0: memref<4x1000xf16>) -> memref<4xf16> attributes {__byteir_reduction_fusion__} { + %cst = arith.constant 0.000000e+00 : f16 + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %alloc = memref.alloc() : memref<4xf16> + scf.forall (%arg1) in (4) { + %subview = memref.subview %arg0[%arg1, 0] [1, 1000] [1, 1] : memref<4x1000xf16> to memref<1000xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<1000xf16, strided<[1], offset: ?>> into memref<1x1000xf16, strided<[1000, 1], offset: ?>> + %alloca = memref.alloca() : memref<512xf16, #gpu.address_space> + scf.forall (%arg2) in (512) { + %0 = affine.min #map1(%arg2) + %1 = affine.min #map2(%arg2) + %2 = affine.apply #map3(%1, %0) + %subview_8 = memref.subview %expand_shape[0, %0] [1, %2] [1, 1] : memref<1x1000xf16, strided<[1000, 1], offset: ?>> to memref> + %expand_shape_9 = memref.expand_shape %subview_8 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %3 = arith.cmpi ugt, %2, %c0 : index + %4 = scf.if %3 -> (f16) { + %11 = memref.load %expand_shape_9[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %11 : f16 + } else { + scf.yield %cst : f16 + } + %5 = math.exp %4 : f16 + %6 = arith.addf %5, %cst : f16 + %7 = arith.cmpi ugt, %2, %c1 : index + %8 = scf.if %7 -> (f16) { + %11 = memref.load %expand_shape_9[%c0, %c1] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %11 : f16 + } else { + scf.yield %cst : f16 + } + %9 = math.exp %8 : f16 + %10 = arith.addf %6, %9 : f16 + memref.store %10, %alloca[%arg2] : memref<512xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_0 = memref.alloca() : memref<256xf16, #gpu.address_space> + scf.forall (%arg2) in (256) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load 
%alloca[%0] : memref<512xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca[%3] : memref<512xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_0[%arg2] : memref<256xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_1 = memref.alloca() : memref<128xf16, #gpu.address_space> + scf.forall (%arg2) in (128) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_0[%0] : memref<256xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_0[%3] : memref<256xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_1[%arg2] : memref<128xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_2 = memref.alloca() : memref<64xf16, #gpu.address_space> + scf.forall (%arg2) in (64) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_1[%0] : memref<128xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_1[%3] : memref<128xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_2[%arg2] : memref<64xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_3 = memref.alloca() : memref<32xf16, #gpu.address_space> + scf.forall (%arg2) in (32) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_2[%0] : memref<64xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_2[%3] : memref<64xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_3[%arg2] : memref<32xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_4 = memref.alloca() : memref<16xf16, #gpu.address_space> + scf.forall (%arg2) in (16) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_3[%0] : memref<32xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_3[%3] : memref<32xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_4[%arg2] : memref<16xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_5 = memref.alloca() : memref<8xf16, #gpu.address_space> + scf.forall (%arg2) in (8) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_4[%0] : memref<16xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_4[%3] : memref<16xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_5[%arg2] : memref<8xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_6 = memref.alloca() : memref<4xf16, #gpu.address_space> + scf.forall (%arg2) in (4) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_5[%0] : memref<8xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_5[%3] : memref<8xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_6[%arg2] : memref<4xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_7 = memref.alloca() : memref<2xf16, #gpu.address_space> + scf.forall (%arg2) in (2) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_6[%0] : memref<4xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_6[%3] : memref<4xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_7[%arg2] : 
memref<2xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + scf.forall (%arg2) in (1) { + %0 = affine.apply #map4(%arg2) + %1 = memref.load %alloca_7[%0] : memref<2xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = affine.apply #map5(%arg2) + %4 = memref.load %alloca_7[%3] : memref<2xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloc[%arg1] : memref<4xf16> + } {mapping = [#gpu.thread]} + } {mapping = [#gpu.block]} + return %alloc : memref<4xf16> + } + func.func private @Unknown68(%arg0: memref<4xf16>) -> memref<4xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %alloc = memref.alloc() : memref<4xf16> + scf.for %arg1 = %c0 to %c4 step %c1 { + %subview = memref.subview %arg0[%arg1] [1] [1] : memref<4xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1] [1] [1] : memref<4xf16> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f16): + %0 = math.log %in : f16 + linalg.yield %0 : f16 + } + } + return %alloc : memref<4xf16> + } + func.func private @Unknown69(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4xf16>, %arg3: memref<4x1000xf16>) -> (memref<4x1000xf16>, memref<4x1000xf16>) attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index %alloc = memref.alloc() : memref<4x1000xf16> - %alloc_0 = memref.alloc() : memref<4x1000xf32> - %alloc_1 = memref.alloc() : memref<4x1000xf32> - linalg.generic {indexing_maps = [#map1, #map1, #map3, #map3, #map1, #map1, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg3, %arg1, %arg0, %arg2, %arg4 : memref<4x1000xf16>, memref<4x1000xf16>, memref<4xf16>, memref<4xf16>, memref<4x1000xf32>) outs(%alloc, %alloc_0, %alloc_1 : memref<4x1000xf16>, memref<4x1000xf32>, memref<4x1000xf32>) { - ^bb0(%in: f16, %in_2: f16, %in_3: f16, %in_4: f16, %in_5: f32, %out: f16, %out_6: f32, %out_7: f32): - %0 = math.log %in_3 : f16 - %1 = arith.subf %in_2, %0 : f16 - %2 = math.exp %1 : f16 - %3 = arith.mulf %2, %in_4 : f16 - %4 = arith.subf %in, %3 : f16 - %5 = arith.extf %1 : f16 to f32 - %6 = arith.mulf %5, %in_5 : f32 - %7 = arith.extf %4 : f16 to f32 - linalg.yield %4, %6, %7 : f16, f32, f32 + %alloc_0 = memref.alloc() : memref<4x1000xf16> + scf.for %arg4 = %c0 to %c4 step %c1 { + scf.for %arg5 = %c0 to %c1000 step %c1 { + %subview = memref.subview %arg2[%arg4] [1] [1] : memref<4xf16> to memref> + %subview_1 = memref.subview %alloc_0[%arg4, %arg5] [1, 1] [1, 1] : memref<4x1000xf16> to memref> + %subview_2 = memref.subview %alloc[%arg4, %arg5] [1, 1] [1, 1] : memref<4x1000xf16> to memref> + %subview_3 = memref.subview %arg0[%arg4] [1] [1] : memref<4xf16> to memref> + %subview_4 = memref.subview %arg1[%arg4, %arg5] [1, 1] [1, 1] : memref<4x1000xf16> to memref> + %subview_5 = memref.subview %arg3[%arg4, %arg5] [1, 1] [1, 1] : memref<4x1000xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_3, %subview_4, %subview_5 : memref>, memref>, memref>, memref>) outs(%subview_2, %subview_1 : memref>, memref>) { + ^bb0(%in: f16, %in_6: f16, %in_7: f16, %in_8: f16, %out: f16, %out_9: f16): + %0 = arith.subf %in_7, %in_6 : f16 + %1 = math.exp %0 : f16 + %2 = arith.mulf %1, %in : f16 + %3 = 
arith.subf %in_8, %2 : f16 + linalg.yield %0, %3 : f16, f16 + } + } } - return %alloc, %alloc_0, %alloc_1 : memref<4x1000xf16>, memref<4x1000xf32>, memref<4x1000xf32> + return %alloc, %alloc_0 : memref<4x1000xf16>, memref<4x1000xf16> } - func.func private @Unknown64(%arg0: memref<4x512xf16>, %arg1: memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown70(%arg0: memref<4x512xf16>, %arg1: memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 4.900000e+01 : f16 %cst_0 = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %alloc = memref.alloc() : memref<4x512x7x7xf16> - linalg.generic {indexing_maps = [#map, #map4, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg1, %arg0 : memref<4x512x7x7xi1>, memref<4x512xf16>) outs(%alloc : memref<4x512x7x7xf16>) { - ^bb0(%in: i1, %in_1: f16, %out: f16): - %0 = arith.divf %in_1, %cst : f16 - %1 = arith.select %in, %0, %cst_0 : f16 - linalg.yield %1 : f16 + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c512 step %c1 { + scf.for %arg4 = %c0 to %c7 step %c1 { + scf.for %arg5 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[%arg2, %arg3] [1, 1] [1, 1] : memref<4x512xf16> to memref> + %subview_1 = memref.subview %alloc[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xf16> to memref> + %subview_2 = memref.subview %arg1[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xi1> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_2 : memref>, memref>) outs(%subview_1 : memref>) { + ^bb0(%in: f16, %in_3: i1, %out: f16): + %0 = arith.divf %in, %cst : f16 + %1 = arith.select %in_3, %0, %cst_0 : f16 + linalg.yield %1 : f16 + } + } + } + } } return %alloc : memref<4x512x7x7xf16> } - func.func private @Unknown68(%arg0: memref<4x512x7x7xi1>, %arg1: memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown74(%arg0: memref<4x512x7x7xi1>, %arg1: memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %alloc = memref.alloc() : memref<4x512x7x7xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x512x7x7xi1>, memref<4x512x7x7xf16>) outs(%alloc : memref<4x512x7x7xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %0 = arith.select %in, %in_0, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c512 step %c1 { + scf.for %arg4 = %c0 to %c7 step %c1 { + scf.for %arg5 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xi1> to memref> + %subview_0 = memref.subview %alloc[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xf16> to memref> + %subview_1 = memref.subview %arg1[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xf16> to memref> + linalg.generic {indexing_maps = [#map, 
#map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: i1, %in_2: f16, %out: f16): + %0 = arith.select %in, %in_2, %cst : f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<4x512x7x7xf16> } - func.func private @Unknown72(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown78(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %alloc = memref.alloc() : memref<4x512x7x7xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<4x512x7x7xi1>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>) outs(%alloc : memref<4x512x7x7xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.select %in, %0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg3 = %c0 to %c4 step %c1 { + scf.for %arg4 = %c0 to %c512 step %c1 { + scf.for %arg5 = %c0 to %c7 step %c1 { + scf.for %arg6 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xf16> to memref> + %subview_0 = memref.subview %alloc[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xf16> to memref> + %subview_1 = memref.subview %arg1[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xf16> to memref> + %subview_2 = memref.subview %arg2[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x512x7x7xi1> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_1, %subview_2 : memref>, memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_3: f16, %in_4: i1, %out: f16): + %0 = arith.addf %in, %in_3 : f16 + %1 = arith.select %in_4, %0, %cst : f16 + linalg.yield %1 : f16 + } + } + } + } } return %alloc : memref<4x512x7x7xf16> } - func.func private @Unknown76(%arg0: memref<4x512x7x7xi1>, %arg1: memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x512x7x7xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x512x7x7xi1>, memref<4x512x7x7xf16>) outs(%alloc : memref<4x512x7x7xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %0 = arith.select %in, %in_0, %cst : f16 - linalg.yield %0 : f16 - } - return %alloc : memref<4x512x7x7xf16> - } - func.func private @Unknown83(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x256x14x14xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<4x256x14x14xi1>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>) outs(%alloc : 
memref<4x256x14x14xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.select %in, %0, %cst : f16 - linalg.yield %1 : f16 - } - return %alloc : memref<4x256x14x14xf16> - } - func.func private @Unknown87(%arg0: memref<4x256x14x14xi1>, %arg1: memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown89(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %alloc = memref.alloc() : memref<4x256x14x14xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x256x14x14xi1>, memref<4x256x14x14xf16>) outs(%alloc : memref<4x256x14x14xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %0 = arith.select %in, %in_0, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg3 = %c0 to %c4 step %c1 { + scf.for %arg4 = %c0 to %c256 step %c1 { + scf.for %arg5 = %c0 to %c14 step %c1 { + scf.for %arg6 = %c0 to %c14 step %c1 { + %subview = memref.subview %arg0[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xf16> to memref> + %subview_0 = memref.subview %alloc[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xf16> to memref> + %subview_1 = memref.subview %arg1[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xf16> to memref> + %subview_2 = memref.subview %arg2[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xi1> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_1, %subview_2 : memref>, memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_3: f16, %in_4: i1, %out: f16): + %0 = arith.addf %in, %in_3 : f16 + %1 = arith.select %in_4, %0, %cst : f16 + linalg.yield %1 : f16 + } + } + } + } } return %alloc : memref<4x256x14x14xf16> } - func.func private @Unknown91(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown93(%arg0: memref<4x256x14x14xi1>, %arg1: memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %alloc = memref.alloc() : memref<4x256x14x14xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<4x256x14x14xi1>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>) outs(%alloc : memref<4x256x14x14xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.select %in, %0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c256 step %c1 { + scf.for %arg4 = %c0 to %c14 step %c1 { + scf.for %arg5 = %c0 to %c14 step %c1 { + %subview = memref.subview %arg0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : 
memref<4x256x14x14xi1> to memref> + %subview_0 = memref.subview %alloc[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xf16> to memref> + %subview_1 = memref.subview %arg1[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x256x14x14xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: i1, %in_2: f16, %out: f16): + %0 = arith.select %in, %in_2, %cst : f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<4x256x14x14xf16> } - func.func private @Unknown95(%arg0: memref<4x256x14x14xi1>, %arg1: memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x256x14x14xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x256x14x14xi1>, memref<4x256x14x14xf16>) outs(%alloc : memref<4x256x14x14xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %0 = arith.select %in, %in_0, %cst : f16 - linalg.yield %0 : f16 - } - return %alloc : memref<4x256x14x14xf16> - } - func.func private @Unknown102(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown108(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %alloc = memref.alloc() : memref<4x128x28x28xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<4x128x28x28xi1>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>) outs(%alloc : memref<4x128x28x28xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.select %in, %0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg3 = %c0 to %c4 step %c1 { + scf.for %arg4 = %c0 to %c128 step %c1 { + scf.for %arg5 = %c0 to %c28 step %c1 { + scf.for %arg6 = %c0 to %c28 step %c1 { + %subview = memref.subview %arg0[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xf16> to memref> + %subview_0 = memref.subview %alloc[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xf16> to memref> + %subview_1 = memref.subview %arg1[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xf16> to memref> + %subview_2 = memref.subview %arg2[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xi1> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_1, %subview_2 : memref>, memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_3: f16, %in_4: i1, %out: f16): + %0 = arith.addf %in, %in_3 : f16 + %1 = arith.select %in_4, %0, %cst : f16 + linalg.yield %1 : f16 + } + } + } + } } return %alloc : memref<4x128x28x28xf16> } - func.func private @Unknown106(%arg0: memref<4x128x28x28xi1>, %arg1: memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> attributes 
{__byteir_elementwise_fusion__} { + func.func private @Unknown112(%arg0: memref<4x128x28x28xi1>, %arg1: memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %alloc = memref.alloc() : memref<4x128x28x28xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x128x28x28xi1>, memref<4x128x28x28xf16>) outs(%alloc : memref<4x128x28x28xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %0 = arith.select %in, %in_0, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c128 step %c1 { + scf.for %arg4 = %c0 to %c28 step %c1 { + scf.for %arg5 = %c0 to %c28 step %c1 { + %subview = memref.subview %arg0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xi1> to memref> + %subview_0 = memref.subview %alloc[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xf16> to memref> + %subview_1 = memref.subview %arg1[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x128x28x28xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: i1, %in_2: f16, %out: f16): + %0 = arith.select %in, %in_2, %cst : f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<4x128x28x28xf16> } - func.func private @Unknown110(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x128x28x28xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<4x128x28x28xi1>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>) outs(%alloc : memref<4x128x28x28xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.select %in, %0, %cst : f16 - linalg.yield %1 : f16 - } - return %alloc : memref<4x128x28x28xf16> - } - func.func private @Unknown114(%arg0: memref<4x128x28x28xi1>, %arg1: memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x128x28x28xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x128x28x28xi1>, memref<4x128x28x28xf16>) outs(%alloc : memref<4x128x28x28xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %0 = arith.select %in, %in_0, %cst : f16 - linalg.yield %0 : f16 - } - return %alloc : memref<4x128x28x28xf16> - } - func.func private @Unknown121(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown127(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = 
arith.constant 4 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %alloc = memref.alloc() : memref<4x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<4x64x56x56xi1>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) outs(%alloc : memref<4x64x56x56xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.select %in, %0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg3 = %c0 to %c4 step %c1 { + scf.for %arg4 = %c0 to %c64 step %c1 { + scf.for %arg5 = %c0 to %c56 step %c1 { + scf.for %arg6 = %c0 to %c56 step %c1 { + %subview = memref.subview %arg0[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xf16> to memref> + %subview_0 = memref.subview %alloc[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xf16> to memref> + %subview_1 = memref.subview %arg1[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xf16> to memref> + %subview_2 = memref.subview %arg2[%arg3, %arg4, %arg5, %arg6] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xi1> to memref> + linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = []} ins(%subview, %subview_1, %subview_2 : memref>, memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_3: f16, %in_4: i1, %out: f16): + %0 = arith.addf %in, %in_3 : f16 + %1 = arith.select %in_4, %0, %cst : f16 + linalg.yield %1 : f16 + } + } + } + } } return %alloc : memref<4x64x56x56xf16> } - func.func private @Unknown125(%arg0: memref<4x64x56x56xi1>, %arg1: memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown131(%arg0: memref<4x64x56x56xi1>, %arg1: memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %alloc = memref.alloc() : memref<4x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x64x56x56xi1>, memref<4x64x56x56xf16>) outs(%alloc : memref<4x64x56x56xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %0 = arith.select %in, %in_0, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c64 step %c1 { + scf.for %arg4 = %c0 to %c56 step %c1 { + scf.for %arg5 = %c0 to %c56 step %c1 { + %subview = memref.subview %arg0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xi1> to memref> + %subview_0 = memref.subview %alloc[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xf16> to memref> + %subview_1 = memref.subview %arg1[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: i1, %in_2: f16, %out: f16): + %0 = arith.select %in, %in_2, %cst : f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<4x64x56x56xf16> } - func.func private @Unknown129(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>) -> 
memref<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 + func.func private @Unknown143(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %alloc = memref.alloc() : memref<4x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2, %arg0, %arg1 : memref<4x64x56x56xi1>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) outs(%alloc : memref<4x64x56x56xf16>) { - ^bb0(%in: i1, %in_0: f16, %in_1: f16, %out: f16): - %0 = arith.addf %in_0, %in_1 : f16 - %1 = arith.select %in, %0, %cst : f16 - linalg.yield %1 : f16 + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c64 step %c1 { + scf.for %arg4 = %c0 to %c56 step %c1 { + scf.for %arg5 = %c0 to %c56 step %c1 { + %subview = memref.subview %arg0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xf16> to memref> + %subview_0 = memref.subview %alloc[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xf16> to memref> + %subview_1 = memref.subview %arg1[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x56x56xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %0 = arith.addf %in, %in_2 : f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<4x64x56x56xf16> } - func.func private @Unknown133(%arg0: memref<4x64x56x56xi1>, %arg1: memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %alloc = memref.alloc() : memref<4x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x64x56x56xi1>, memref<4x64x56x56xf16>) outs(%alloc : memref<4x64x56x56xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %0 = arith.select %in, %in_0, %cst : f16 - linalg.yield %0 : f16 - } - return %alloc : memref<4x64x56x56xf16> - } - func.func private @Unknown137(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<4x64x56x56xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) outs(%alloc : memref<4x64x56x56xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %0 = arith.addf %in, %in_0 : f16 - linalg.yield %0 : f16 - } - return %alloc : memref<4x64x56x56xf16> - } - func.func private @Unknown138(%arg0: memref<4x64x112x112xi1>, %arg1: memref<4x64x112x112xf16>) -> memref<4x64x112x112xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown144(%arg0: memref<4x64x112x112xi1>, %arg1: memref<4x64x112x112xf16>) -> memref<4x64x112x112xf16> attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c112 = arith.constant 112 : index %alloc = 
memref.alloc() : memref<4x64x112x112xf16> - linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : memref<4x64x112x112xi1>, memref<4x64x112x112xf16>) outs(%alloc : memref<4x64x112x112xf16>) { - ^bb0(%in: i1, %in_0: f16, %out: f16): - %0 = arith.select %in, %in_0, %cst : f16 - linalg.yield %0 : f16 + scf.for %arg2 = %c0 to %c4 step %c1 { + scf.for %arg3 = %c0 to %c64 step %c1 { + scf.for %arg4 = %c0 to %c112 step %c1 { + scf.for %arg5 = %c0 to %c112 step %c1 { + %subview = memref.subview %arg0[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x112x112xi1> to memref> + %subview_0 = memref.subview %alloc[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x112x112xf16> to memref> + %subview_1 = memref.subview %arg1[%arg2, %arg3, %arg4, %arg5] [1, 1, 1, 1] [1, 1, 1, 1] : memref<4x64x112x112xf16> to memref> + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = []} ins(%subview, %subview_1 : memref>, memref>) outs(%subview_0 : memref>) { + ^bb0(%in: i1, %in_2: f16, %out: f16): + %0 = arith.select %in, %in_2, %cst : f16 + linalg.yield %0 : f16 + } + } + } + } } return %alloc : memref<4x64x112x112xf16> } - func.func private @Unknown141(%arg0: memref) -> memref attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown147(%arg0: memref<4x1000xf16>, %arg1: memref<4x1000xf32>) -> memref attributes {__byteir_reduction_fusion__} { + %cst = arith.constant 0.000000e+00 : f32 + %cst_0 = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %alloc = memref.alloc() : memref + %collapse_shape = memref.collapse_shape %arg0 [[0, 1]] : memref<4x1000xf16> into memref<4000xf16> + %collapse_shape_1 = memref.collapse_shape %arg1 [[0, 1]] : memref<4x1000xf32> into memref<4000xf32> + %expand_shape = memref.expand_shape %collapse_shape [[0, 1]] : memref<4000xf16> into memref<32x125xf16> + %expand_shape_2 = memref.expand_shape %collapse_shape_1 [[0, 1]] : memref<4000xf32> into memref<32x125xf32> + %alloc_3 = memref.alloc() : memref<32xf32> + scf.forall (%arg2) in (32) { + %subview = memref.subview %expand_shape[%arg2, 0] [1, 125] [1, 1] : memref<32x125xf16> to memref<125xf16, strided<[1], offset: ?>> + %expand_shape_4 = memref.expand_shape %subview [[0, 1]] : memref<125xf16, strided<[1], offset: ?>> into memref<1x125xf16, strided<[125, 1], offset: ?>> + %subview_5 = memref.subview %expand_shape_2[%arg2, 0] [1, 125] [1, 1] : memref<32x125xf32> to memref<125xf32, strided<[1], offset: ?>> + %expand_shape_6 = memref.expand_shape %subview_5 [[0, 1]] : memref<125xf32, strided<[1], offset: ?>> into memref<1x125xf32, strided<[125, 1], offset: ?>> + %alloca = memref.alloca() : memref<128xf32, #gpu.address_space> + scf.forall (%arg3) in (128) { + %0 = affine.min #map8(%arg3) + %1 = affine.min #map9(%arg3) + %2 = affine.apply #map3(%1, %0) + %subview_13 = memref.subview %expand_shape_4[0, %0] [1, %2] [1, 1] : memref<1x125xf16, strided<[125, 1], offset: ?>> to memref> + %expand_shape_14 = memref.expand_shape %subview_13 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %subview_15 = memref.subview %expand_shape_6[0, %0] [1, %2] [1, 1] : memref<1x125xf32, strided<[125, 1], offset: ?>> to memref> + %expand_shape_16 = memref.expand_shape %subview_15 [[0, 1]] : memref> into memref<1x?xf32, strided<[?, 1], offset: ?>> + %3 = arith.cmpi ugt, %2, %c0 : index + %4:2 = scf.if %3 -> (f16, f32) { + %8 = memref.load %expand_shape_14[%c0, %c0] : memref<1x?xf16, 
strided<[?, 1], offset: ?>> + %9 = memref.load %expand_shape_16[%c0, %c0] : memref<1x?xf32, strided<[?, 1], offset: ?>> + scf.yield %8, %9 : f16, f32 + } else { + scf.yield %cst_0, %cst : f16, f32 + } + %5 = arith.extf %4#0 : f16 to f32 + %6 = arith.mulf %5, %4#1 : f32 + %7 = arith.addf %6, %cst : f32 + memref.store %7, %alloca[%arg3] : memref<128xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_7 = memref.alloca() : memref<64xf32, #gpu.address_space> + scf.forall (%arg3) in (64) { + %0 = affine.apply #map4(%arg3) + %1 = memref.load %alloca[%0] : memref<128xf32, #gpu.address_space> + %2 = arith.addf %1, %cst : f32 + %3 = affine.apply #map5(%arg3) + %4 = memref.load %alloca[%3] : memref<128xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_7[%arg3] : memref<64xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_8 = memref.alloca() : memref<32xf32, #gpu.address_space> + scf.forall (%arg3) in (32) { + %0 = affine.apply #map4(%arg3) + %1 = memref.load %alloca_7[%0] : memref<64xf32, #gpu.address_space> + %2 = arith.addf %1, %cst : f32 + %3 = affine.apply #map5(%arg3) + %4 = memref.load %alloca_7[%3] : memref<64xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_8[%arg3] : memref<32xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_9 = memref.alloca() : memref<16xf32, #gpu.address_space> + scf.forall (%arg3) in (16) { + %0 = affine.apply #map4(%arg3) + %1 = memref.load %alloca_8[%0] : memref<32xf32, #gpu.address_space> + %2 = arith.addf %1, %cst : f32 + %3 = affine.apply #map5(%arg3) + %4 = memref.load %alloca_8[%3] : memref<32xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_9[%arg3] : memref<16xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_10 = memref.alloca() : memref<8xf32, #gpu.address_space> + scf.forall (%arg3) in (8) { + %0 = affine.apply #map4(%arg3) + %1 = memref.load %alloca_9[%0] : memref<16xf32, #gpu.address_space> + %2 = arith.addf %1, %cst : f32 + %3 = affine.apply #map5(%arg3) + %4 = memref.load %alloca_9[%3] : memref<16xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_10[%arg3] : memref<8xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_11 = memref.alloca() : memref<4xf32, #gpu.address_space> + scf.forall (%arg3) in (4) { + %0 = affine.apply #map4(%arg3) + %1 = memref.load %alloca_10[%0] : memref<8xf32, #gpu.address_space> + %2 = arith.addf %1, %cst : f32 + %3 = affine.apply #map5(%arg3) + %4 = memref.load %alloca_10[%3] : memref<8xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_11[%arg3] : memref<4xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_12 = memref.alloca() : memref<2xf32, #gpu.address_space> + scf.forall (%arg3) in (2) { + %0 = affine.apply #map4(%arg3) + %1 = memref.load %alloca_11[%0] : memref<4xf32, #gpu.address_space> + %2 = arith.addf %1, %cst : f32 + %3 = affine.apply #map5(%arg3) + %4 = memref.load %alloca_11[%3] : memref<4xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_12[%arg3] : memref<2xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + scf.forall (%arg3) in (1) { + %0 = affine.apply #map4(%arg3) + %1 = memref.load %alloca_12[%0] : memref<2xf32, #gpu.address_space> + %2 = arith.addf %1, %cst : f32 + %3 = affine.apply #map5(%arg3) + %4 = memref.load %alloca_12[%3] : memref<2xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, 
%alloc_3[%arg2] : memref<32xf32> + } {mapping = [#gpu.thread]} + } {mapping = [#gpu.block]} + scf.forall (%arg2) in (1) { + %alloca = memref.alloca() : memref<32xf32, #gpu.address_space> + scf.forall (%arg3) in (32) { + %0 = affine.apply #map10(%arg2)[%arg3] + %1 = memref.load %alloc_3[%0] : memref<32xf32> + %2 = arith.addf %1, %cst : f32 + memref.store %2, %alloca[%arg3] : memref<32xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_4 = memref.alloca() : memref<16xf32, #gpu.address_space> + scf.forall (%arg3) in (16) { + %0 = affine.apply #map4(%arg3) + %1 = memref.load %alloca[%0] : memref<32xf32, #gpu.address_space> + %2 = arith.addf %1, %cst : f32 + %3 = affine.apply #map5(%arg3) + %4 = memref.load %alloca[%3] : memref<32xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_4[%arg3] : memref<16xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_5 = memref.alloca() : memref<8xf32, #gpu.address_space> + scf.forall (%arg3) in (8) { + %0 = affine.apply #map4(%arg3) + %1 = memref.load %alloca_4[%0] : memref<16xf32, #gpu.address_space> + %2 = arith.addf %1, %cst : f32 + %3 = affine.apply #map5(%arg3) + %4 = memref.load %alloca_4[%3] : memref<16xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_5[%arg3] : memref<8xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_6 = memref.alloca() : memref<4xf32, #gpu.address_space> + scf.forall (%arg3) in (4) { + %0 = affine.apply #map4(%arg3) + %1 = memref.load %alloca_5[%0] : memref<8xf32, #gpu.address_space> + %2 = arith.addf %1, %cst : f32 + %3 = affine.apply #map5(%arg3) + %4 = memref.load %alloca_5[%3] : memref<8xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_6[%arg3] : memref<4xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_7 = memref.alloca() : memref<2xf32, #gpu.address_space> + scf.forall (%arg3) in (2) { + %0 = affine.apply #map4(%arg3) + %1 = memref.load %alloca_6[%0] : memref<4xf32, #gpu.address_space> + %2 = arith.addf %1, %cst : f32 + %3 = affine.apply #map5(%arg3) + %4 = memref.load %alloca_6[%3] : memref<4xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_7[%arg3] : memref<2xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + scf.forall (%arg3) in (1) { + %0 = affine.apply #map4(%arg3) + %1 = memref.load %alloca_7[%0] : memref<2xf32, #gpu.address_space> + %2 = arith.addf %1, %cst : f32 + %3 = affine.apply #map5(%arg3) + %4 = memref.load %alloca_7[%3] : memref<2xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloc[] : memref + } {mapping = [#gpu.thread]} + } {mapping = [#gpu.block]} + return %alloc : memref + } + func.func private @Unknown148(%arg0: memref) -> memref attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 4.000000e+00 : f32 %alloc = memref.alloc() : memref - linalg.generic {indexing_maps = [#map5, #map5], iterator_types = []} ins(%arg0 : memref) outs(%alloc : memref) { + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%arg0 : memref) outs(%alloc : memref) { ^bb0(%in: f32, %out: f32): %0 = arith.negf %in : f32 %1 = arith.divf %0, %cst : f32 @@ -679,202 +1634,335 @@ module @IrToMhlo.2452 { } return %alloc : memref } - func.func private @Unknown142(%arg0: memref<64x3x7x7xf16>) -> memref<64x3x7x7xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown149(%arg0: memref<64x3x7x7xf16>) -> memref<64x3x7x7xf32> attributes {__byteir_elementwise_fusion__} 
{ + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + %c7 = arith.constant 7 : index %alloc = memref.alloc() : memref<64x3x7x7xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x3x7x7xf16>) outs(%alloc : memref<64x3x7x7xf32>) attrs = {xla_shape = "f32[64,3,7,7]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c64 step %c1 { + scf.for %arg2 = %c0 to %c3 step %c1 { + scf.for %arg3 = %c0 to %c7 step %c1 { + scf.for %arg4 = %c0 to %c7 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x3x7x7xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x3x7x7xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<64x3x7x7xf32> } - func.func private @Unknown143(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<64x64x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf16>) outs(%alloc : memref<64x64x3x3xf32>) attrs = {xla_shape = "f32[64,64,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<64x64x3x3xf32> - } - func.func private @Unknown144(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<64x64x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf16>) outs(%alloc : memref<64x64x3x3xf32>) attrs = {xla_shape = "f32[64,64,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<64x64x3x3xf32> - } - func.func private @Unknown145(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<64x64x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf16>) outs(%alloc : memref<64x64x3x3xf32>) attrs = {xla_shape = "f32[64,64,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<64x64x3x3xf32> - } - func.func private @Unknown146(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown150(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<64x64x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<64x64x3x3xf16>) outs(%alloc : memref<64x64x3x3xf32>) attrs = {xla_shape = 
"f32[64,64,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c64 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x64x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<64x64x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<64x64x3x3xf32> } - func.func private @Unknown147(%arg0: memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown154(%arg0: memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<128x64x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x64x3x3xf16>) outs(%alloc : memref<128x64x3x3xf32>) attrs = {xla_shape = "f32[128,64,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<128x64x3x3xf32> } - func.func private @Unknown148(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown155(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<128x128x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x128x3x3xf16>) outs(%alloc : memref<128x128x3x3xf32>) attrs = {xla_shape = "f32[128,128,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c128 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x128x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x128x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], 
iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<128x128x3x3xf32> } - func.func private @Unknown149(%arg0: memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown156(%arg0: memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c64 = arith.constant 64 : index %alloc = memref.alloc() : memref<128x64x1x1xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x64x1x1xf16>) outs(%alloc : memref<128x64x1x1xf32>) attrs = {xla_shape = "f32[128,64,1,1]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c128 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x1x1xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<128x64x1x1xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } } return %alloc : memref<128x64x1x1xf32> } - func.func private @Unknown150(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<128x128x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x128x3x3xf16>) outs(%alloc : memref<128x128x3x3xf32>) attrs = {xla_shape = "f32[128,128,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<128x128x3x3xf32> - } - func.func private @Unknown151(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<128x128x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<128x128x3x3xf16>) outs(%alloc : memref<128x128x3x3xf32>) attrs = {xla_shape = "f32[128,128,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<128x128x3x3xf32> - } - func.func private @Unknown152(%arg0: memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown159(%arg0: memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<256x128x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x128x3x3xf16>) outs(%alloc : memref<256x128x3x3xf32>) attrs = {xla_shape = "f32[256,128,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + 
scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c128 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<256x128x3x3xf32> } - func.func private @Unknown153(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown160(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<256x256x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x256x3x3xf16>) outs(%alloc : memref<256x256x3x3xf32>) attrs = {xla_shape = "f32[256,256,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x256x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x256x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<256x256x3x3xf32> } - func.func private @Unknown154(%arg0: memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown161(%arg0: memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index %alloc = memref.alloc() : memref<256x128x1x1xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x128x1x1xf16>) outs(%alloc : memref<256x128x1x1xf32>) attrs = {xla_shape = "f32[256,128,1,1]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c256 step %c1 { + scf.for %arg2 = %c0 to %c128 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x1x1xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<256x128x1x1xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } } return %alloc : memref<256x128x1x1xf32> } - func.func 
private @Unknown155(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<256x256x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x256x3x3xf16>) outs(%alloc : memref<256x256x3x3xf32>) attrs = {xla_shape = "f32[256,256,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<256x256x3x3xf32> - } - func.func private @Unknown156(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<256x256x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<256x256x3x3xf16>) outs(%alloc : memref<256x256x3x3xf32>) attrs = {xla_shape = "f32[256,256,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<256x256x3x3xf32> - } - func.func private @Unknown157(%arg0: memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown164(%arg0: memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<512x256x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x256x3x3xf16>) outs(%alloc : memref<512x256x3x3xf32>) attrs = {xla_shape = "f32[512,256,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<512x256x3x3xf32> } - func.func private @Unknown158(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown165(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index %alloc = memref.alloc() : memref<512x512x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x512x3x3xf16>) outs(%alloc : memref<512x512x3x3xf32>) attrs = {xla_shape = "f32[512,512,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c512 step %c1 { + scf.for %arg3 = %c0 to %c3 step %c1 { + 
scf.for %arg4 = %c0 to %c3 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x512x3x3xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x512x3x3xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } + } + } } return %alloc : memref<512x512x3x3xf32> } - func.func private @Unknown159(%arg0: memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown166(%arg0: memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<512x256x1x1xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x256x1x1xf16>) outs(%alloc : memref<512x256x1x1xf32>) attrs = {xla_shape = "f32[512,256,1,1]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c512 step %c1 { + scf.for %arg2 = %c0 to %c256 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x1x1xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2, 0, 0] [1, 1, 1, 1] [1, 1, 1, 1] : memref<512x256x1x1xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } } return %alloc : memref<512x256x1x1xf32> } - func.func private @Unknown160(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<512x512x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x512x3x3xf16>) outs(%alloc : memref<512x512x3x3xf32>) attrs = {xla_shape = "f32[512,512,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<512x512x3x3xf32> - } - func.func private @Unknown161(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %alloc = memref.alloc() : memref<512x512x3x3xf32> - linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : memref<512x512x3x3xf16>) outs(%alloc : memref<512x512x3x3xf32>) attrs = {xla_shape = "f32[512,512,3,3]{0,1,3,2}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 - } - return %alloc : memref<512x512x3x3xf32> - } - func.func private @Unknown163(%arg0: memref<1000x512xf16>) -> memref<1000x512xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown170(%arg0: memref<1000x512xf16>) -> memref<1000x512xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index + %c512 = arith.constant 512 : index %alloc = memref.alloc() : memref<1000x512xf32> - linalg.generic 
{indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg0 : memref<1000x512xf16>) outs(%alloc : memref<1000x512xf32>) attrs = {xla_shape = "f32[1000,512]{0,1}"} { - ^bb0(%in: f16, %out: f32): - %0 = arith.extf %in : f16 to f32 - linalg.yield %0 : f32 + scf.for %arg1 = %c0 to %c1000 step %c1 { + scf.for %arg2 = %c0 to %c512 step %c1 { + %subview = memref.subview %arg0[%arg1, %arg2] [1, 1] [1, 1] : memref<1000x512xf16> to memref> + %subview_0 = memref.subview %alloc[%arg1, %arg2] [1, 1] [1, 1] : memref<1000x512xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f16, %out: f32): + %0 = arith.extf %in : f16 to f32 + linalg.yield %0 : f32 + } + } } return %alloc : memref<1000x512xf32> } - func.func private @Unknown164(%arg0: memref<1000xf32>) -> memref<1000xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown171(%arg0: memref<4x1000xf16>) -> memref<1000xf32> attributes {__byteir_reduction_fusion__} { + %cst = arith.constant 0.000000e+00 : f32 + %cst_0 = arith.constant 0.000000e+00 : f16 + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %alloc = memref.alloc() : memref<1000xf32> + scf.forall (%arg1) in (32) { + %0 = affine.min #map11(%arg1) + %1 = affine.apply #map12(%arg1) + %alloca = memref.alloca() : memref<32xf32, #gpu.address_space> + %alloca_1 = memref.alloca() : memref<2x32xf32, #gpu.address_space> + scf.forall (%arg2, %arg3) in (2, 32) { + %2 = affine.min #map13(%arg3, %arg1) + %3 = affine.min #map14(%arg3, %arg1) + %4 = affine.apply #map3(%3, %2) + %5 = arith.cmpi ugt, %4, %c0 : index + %6 = scf.if %5 -> (f16) { + %12 = affine.apply #map4(%arg2) + %13 = affine.apply #map10(%arg1)[%2] + %14 = memref.load %arg0[%12, %13] : memref<4x1000xf16> + scf.yield %14 : f16 + } else { + scf.yield %cst_0 : f16 + } + %7 = arith.extf %6 : f16 to f32 + %8 = arith.addf %7, %cst : f32 + %9 = scf.if %5 -> (f16) { + %12 = affine.apply #map5(%arg2) + %13 = affine.apply #map10(%arg1)[%2] + %14 = memref.load %arg0[%12, %13] : memref<4x1000xf16> + scf.yield %14 : f16 + } else { + scf.yield %cst_0 : f16 + } + %10 = arith.extf %9 : f16 to f32 + %11 = arith.addf %8, %10 : f32 + memref.store %11, %alloca_1[%arg2, %arg3] : memref<2x32xf32, #gpu.address_space> + } {mapping = [#gpu.thread, #gpu.thread]} + scf.forall (%arg2) in (32) { + %2 = memref.load %alloca_1[%c0, %arg2] : memref<2x32xf32, #gpu.address_space> + %3 = arith.addf %2, %cst : f32 + %4 = memref.load %alloca_1[%c1, %arg2] : memref<2x32xf32, #gpu.address_space> + %5 = arith.addf %4, %3 : f32 + memref.store %5, %alloca[%arg2] : memref<32xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %subview = memref.subview %alloca[0] [%0] [1] : memref<32xf32, #gpu.address_space> to memref, #gpu.address_space> + %subview_2 = memref.subview %alloc[%1] [%0] [1] : memref<1000xf32> to memref> + memref.copy %subview, %subview_2 : memref, #gpu.address_space> to memref> + } {mapping = [#gpu.block]} + return %alloc : memref<1000xf32> + } + func.func private @Unknown172(%arg0: memref<1000xf32>) -> memref<1000xf32> attributes {__byteir_elementwise_fusion__} { + %c0 = arith.constant 0 : index + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index %alloc = memref.alloc() : memref<1000xf32> - linalg.generic {indexing_maps = [#map6, #map6], iterator_types = ["parallel"]} ins(%arg0 : memref<1000xf32>) outs(%alloc : memref<1000xf32>) { - ^bb0(%in: f32, %out: f32): - %0 = arith.truncf %in : f32 
to f16 - %1 = arith.extf %0 : f16 to f32 - linalg.yield %1 : f32 + scf.for %arg1 = %c0 to %c1000 step %c1 { + %subview = memref.subview %arg0[%arg1] [1] [1] : memref<1000xf32> to memref> + %subview_0 = memref.subview %alloc[%arg1] [1] [1] : memref<1000xf32> to memref> + linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%subview : memref>) outs(%subview_0 : memref>) { + ^bb0(%in: f32, %out: f32): + %0 = arith.truncf %in : f32 to f16 + %1 = arith.extf %0 : f16 to f32 + linalg.yield %1 : f32 + } } return %alloc : memref<1000xf32> } @@ -886,344 +1974,340 @@ module @IrToMhlo.2452 { %alloc_0 = memref.alloc() : memref<4x64x112x112xf16> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc, %arg3, %arg4, %alloc_0) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x112x112xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x112x112xf16> %2 = call @Unknown3(%arg7) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> - %3 = call @Unknown4(%arg12) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> - %4 = call @Unknown5(%arg17) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> - %5 = call @Unknown6(%arg22) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> + %3 = call @Unknown3(%arg12) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> + %4 = call @Unknown3(%arg17) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> + %5 = call @Unknown3(%arg22) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> %6 = call @Unknown7(%arg37) : (memref<128x64x1x1xf32>) -> memref<128x64x1x1xf16> %7 = call @Unknown8(%arg27) : (memref<128x64x3x3xf32>) -> memref<128x64x3x3xf16> %8 = call @Unknown9(%arg32) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> - %9 = call @Unknown10(%arg42) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> - %10 = call @Unknown11(%arg47) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> + %9 = call @Unknown9(%arg42) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> + %10 = call @Unknown9(%arg47) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> %11 = call @Unknown12(%arg62) : (memref<256x128x1x1xf32>) -> memref<256x128x1x1xf16> %12 = call @Unknown13(%arg52) : (memref<256x128x3x3xf32>) -> memref<256x128x3x3xf16> %13 = call @Unknown14(%arg57) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> - %14 = call @Unknown15(%arg67) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> - %15 = call @Unknown16(%arg72) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> + %14 = call @Unknown14(%arg67) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> + %15 = call @Unknown14(%arg72) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> %16 = call @Unknown17(%arg87) : (memref<512x256x1x1xf32>) -> memref<512x256x1x1xf16> %17 = call @Unknown18(%arg77) : (memref<512x256x3x3xf32>) -> memref<512x256x3x3xf16> %18 = call @Unknown19(%arg82) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> - %19 = call @Unknown20(%arg92) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> - %20 = call @Unknown21(%arg97) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> + %19 = call @Unknown19(%arg92) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> + %20 = call @Unknown19(%arg97) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> %21 = call @Unknown22(%arg1) : (memref<4x1000xf32>) -> memref<4x1000xf16> %22 = call @Unknown23(%arg102) : (memref<1000x512xf32>) -> memref<1000x512xf16> - %alloc_1 = memref.alloc() : memref<4xf16> - byre.compute @ReduceSumOp_f16_f16(%21, %alloc_1) {dimensions 
= dense<1> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf16>, memref<4xf16> - %23:2 = call @Unknown24(%alloc_0) : (memref<4x64x112x112xf16>) -> (memref<4x64x112x112xf16>, memref<4x64x112x112xi1>) + %23 = call @Unknown24(%arg103) : (memref<1000xf32>) -> memref<1000xf16> + %24 = call @Unknown25(%21) : (memref<4x1000xf16>) -> memref<4xf16> + %25:2 = call @Unknown26(%alloc_0) : (memref<4x64x112x112xf16>) -> (memref<4x64x112x112xf16>, memref<4x64x112x112xi1>) + %alloc_1 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @PoolMaxOp_f16_f16(%25#0, %alloc_1) {base_dilations = dense<1> : tensor<4xi64>, memory_effects = [1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = dense<1> : tensor<4xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16>, memref<4x64x56x56xf16> %alloc_2 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @PoolMaxOp_f16_f16(%23#0, %alloc_2) {base_dilations = dense<1> : tensor<4xi64>, memory_effects = [1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = dense<1> : tensor<4xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16>, memref<4x64x56x56xf16> + byre.compute @ConvOp_f16f16_f16(%alloc_1, %2, %alloc_2) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> %alloc_3 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvOp_f16f16_f16(%alloc_2, %2, %alloc_3) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_2, %arg8, %arg9, %alloc_3) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> + %26:2 = call @Unknown28(%alloc_3) : (memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) %alloc_4 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_3, %arg8, %arg9, %alloc_4) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> - %24:2 = call @Unknown26(%alloc_4) : (memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) + byre.compute @ConvOp_f16f16_f16(%26#0, %3, %alloc_4) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation 
= dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> %alloc_5 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvOp_f16f16_f16(%24#0, %3, %alloc_5) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_4, %arg13, %arg14, %alloc_5) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> + %27:2 = call @Unknown30(%alloc_5, %alloc_1) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) %alloc_6 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_5, %arg13, %arg14, %alloc_6) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> - %25:2 = call @Unknown28(%alloc_6, %alloc_2) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) + byre.compute @ConvOp_f16f16_f16(%27#0, %4, %alloc_6) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> %alloc_7 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvOp_f16f16_f16(%25#0, %4, %alloc_7) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_6, %arg18, %arg19, %alloc_7) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> + %28:2 = call @Unknown28(%alloc_7) : (memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) %alloc_8 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_7, %arg18, %arg19, %alloc_8) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> - %26:2 = call @Unknown30(%alloc_8) : (memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) + byre.compute @ConvOp_f16f16_f16(%28#0, %5, %alloc_8) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", 
lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> %alloc_9 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvOp_f16f16_f16(%26#0, %5, %alloc_9) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> - %alloc_10 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_9, %arg23, %arg24, %alloc_10) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> - %27:2 = call @Unknown32(%alloc_10, %25#0) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_8, %arg23, %arg24, %alloc_9) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> + %29:2 = call @Unknown30(%alloc_9, %27#0) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) + %alloc_10 = memref.alloc() : memref<4x128x28x28xf16> + byre.compute @ConvOp_f16f16_f16(%29#0, %6, %alloc_10) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<128x64x1x1xf16>, memref<4x128x28x28xf16> %alloc_11 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvOp_f16f16_f16(%27#0, %6, %alloc_11) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<128x64x1x1xf16>, memref<4x128x28x28xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_10, %arg38, %arg39, %alloc_11) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> %alloc_12 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_11, %arg38, %arg39, %alloc_12) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> + byre.compute @ConvOp_f16f16_f16(%29#0, %7, %alloc_12) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout 
= "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<128x64x3x3xf16>, memref<4x128x28x28xf16> %alloc_13 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvOp_f16f16_f16(%27#0, %7, %alloc_13) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<128x64x3x3xf16>, memref<4x128x28x28xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_12, %arg28, %arg29, %alloc_13) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> + %30:2 = call @Unknown37(%alloc_13) : (memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) %alloc_14 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_13, %arg28, %arg29, %alloc_14) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> - %28:2 = call @Unknown35(%alloc_14) : (memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) + byre.compute @ConvOp_f16f16_f16(%30#0, %8, %alloc_14) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> %alloc_15 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvOp_f16f16_f16(%28#0, %8, %alloc_15) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_14, %arg33, %arg34, %alloc_15) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> + %31:2 = call @Unknown39(%alloc_15, %alloc_11) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) %alloc_16 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_15, %arg33, %arg34, %alloc_16) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> - %29:2 = call @Unknown37(%alloc_16, %alloc_12) : (memref<4x128x28x28xf16>, 
memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) + byre.compute @ConvOp_f16f16_f16(%31#0, %9, %alloc_16) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> %alloc_17 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvOp_f16f16_f16(%29#0, %9, %alloc_17) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_16, %arg43, %arg44, %alloc_17) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> + %32:2 = call @Unknown37(%alloc_17) : (memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) %alloc_18 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_17, %arg43, %arg44, %alloc_18) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> - %30:2 = call @Unknown39(%alloc_18) : (memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) + byre.compute @ConvOp_f16f16_f16(%32#0, %10, %alloc_18) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> %alloc_19 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvOp_f16f16_f16(%30#0, %10, %alloc_19) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> - %alloc_20 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_19, %arg48, %arg49, %alloc_20) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> - %31:2 = call @Unknown41(%alloc_20, %29#0) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_18, %arg48, %arg49, %alloc_19) {epsilon = 9.99999974E-6 : f32, 
feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> + %33:2 = call @Unknown39(%alloc_19, %31#0) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) + %alloc_20 = memref.alloc() : memref<4x256x14x14xf16> + byre.compute @ConvOp_f16f16_f16(%33#0, %11, %alloc_20) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<256x128x1x1xf16>, memref<4x256x14x14xf16> %alloc_21 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvOp_f16f16_f16(%31#0, %11, %alloc_21) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<256x128x1x1xf16>, memref<4x256x14x14xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_20, %arg63, %arg64, %alloc_21) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> %alloc_22 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_21, %arg63, %arg64, %alloc_22) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> + byre.compute @ConvOp_f16f16_f16(%33#0, %12, %alloc_22) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<256x128x3x3xf16>, memref<4x256x14x14xf16> %alloc_23 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvOp_f16f16_f16(%31#0, %12, %alloc_23) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<256x128x3x3xf16>, memref<4x256x14x14xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_22, %arg53, %arg54, %alloc_23) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> + %34:2 = call @Unknown46(%alloc_23) : (memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) %alloc_24 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_23, %arg53, %arg54, %alloc_24) {epsilon = 
9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> - %32:2 = call @Unknown44(%alloc_24) : (memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) + byre.compute @ConvOp_f16f16_f16(%34#0, %13, %alloc_24) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> %alloc_25 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvOp_f16f16_f16(%32#0, %13, %alloc_25) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_24, %arg58, %arg59, %alloc_25) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> + %35:2 = call @Unknown48(%alloc_25, %alloc_21) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) %alloc_26 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_25, %arg58, %arg59, %alloc_26) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> - %33:2 = call @Unknown46(%alloc_26, %alloc_22) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) + byre.compute @ConvOp_f16f16_f16(%35#0, %14, %alloc_26) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> %alloc_27 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvOp_f16f16_f16(%33#0, %14, %alloc_27) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_26, %arg68, %arg69, %alloc_27) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> + %36:2 = call 
@Unknown46(%alloc_27) : (memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) %alloc_28 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_27, %arg68, %arg69, %alloc_28) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> - %34:2 = call @Unknown48(%alloc_28) : (memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) + byre.compute @ConvOp_f16f16_f16(%36#0, %15, %alloc_28) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> %alloc_29 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvOp_f16f16_f16(%34#0, %15, %alloc_29) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> - %alloc_30 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_29, %arg73, %arg74, %alloc_30) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> - %35:2 = call @Unknown50(%alloc_30, %33#0) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_28, %arg73, %arg74, %alloc_29) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> + %37:2 = call @Unknown48(%alloc_29, %35#0) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) + %alloc_30 = memref.alloc() : memref<4x512x7x7xf16> + byre.compute @ConvOp_f16f16_f16(%37#0, %16, %alloc_30) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<512x256x1x1xf16>, memref<4x512x7x7xf16> %alloc_31 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvOp_f16f16_f16(%35#0, %16, %alloc_31) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<512x256x1x1xf16>, memref<4x512x7x7xf16> + byre.compute 
@BatchNormTrainingOp_f16f32f32_f16(%alloc_30, %arg88, %arg89, %alloc_31) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16>
     %alloc_32 = memref.alloc() : memref<4x512x7x7xf16>
-    byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_31, %arg88, %arg89, %alloc_32) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16>
+    byre.compute @ConvOp_f16f16_f16(%37#0, %17, %alloc_32) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<512x256x3x3xf16>, memref<4x512x7x7xf16>
     %alloc_33 = memref.alloc() : memref<4x512x7x7xf16>
-    byre.compute @ConvOp_f16f16_f16(%35#0, %17, %alloc_33) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<512x256x3x3xf16>, memref<4x512x7x7xf16>
+    byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_32, %arg78, %arg79, %alloc_33) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16>
+    %38:2 = call @Unknown55(%alloc_33) : (memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>)
     %alloc_34 = memref.alloc() : memref<4x512x7x7xf16>
-    byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_33, %arg78, %arg79, %alloc_34) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16>
-    %36:2 = call @Unknown53(%alloc_34) : (memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>)
+    byre.compute @ConvOp_f16f16_f16(%38#0, %18, %alloc_34) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16>
     %alloc_35 = memref.alloc() : memref<4x512x7x7xf16>
-    byre.compute @ConvOp_f16f16_f16(%36#0, %18, %alloc_35) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16>
+    byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_34, %arg83, %arg84, %alloc_35) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16>
+    %39:2 = call @Unknown57(%alloc_35, %alloc_31) : (memref<4x512x7x7xf16>, memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>)
     %alloc_36 = memref.alloc() : memref<4x512x7x7xf16>
-    byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_35, %arg83, %arg84, %alloc_36) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16>
-    %37:2 = call @Unknown55(%alloc_36, %alloc_32) : (memref<4x512x7x7xf16>, memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>)
+    byre.compute @ConvOp_f16f16_f16(%39#0, %19, %alloc_36) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16>
     %alloc_37 = memref.alloc() : memref<4x512x7x7xf16>
-    byre.compute @ConvOp_f16f16_f16(%37#0, %19, %alloc_37) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16>
+    byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_36, %arg93, %arg94, %alloc_37) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16>
+    %40:2 = call @Unknown55(%alloc_37) : (memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>)
     %alloc_38 = memref.alloc() : memref<4x512x7x7xf16>
-    byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_37, %arg93, %arg94, %alloc_38) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16>
-    %38:2 = call @Unknown57(%alloc_38) : (memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>)
+    byre.compute @ConvOp_f16f16_f16(%40#0, %20, %alloc_38) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16>
     %alloc_39 = memref.alloc() : memref<4x512x7x7xf16>
-    byre.compute @ConvOp_f16f16_f16(%38#0, %20, %alloc_39) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>,
memref<512x512x3x3xf16>, memref<4x512x7x7xf16> - %alloc_40 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_39, %arg98, %arg99, %alloc_40) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> - %39:2 = call @Unknown59(%alloc_40, %37#0) : (memref<4x512x7x7xf16>, memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_38, %arg98, %arg99, %alloc_39) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> + %41:2 = call @Unknown57(%alloc_39, %39#0) : (memref<4x512x7x7xf16>, memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) + %42 = call @Unknown62(%41#0) : (memref<4x512x7x7xf16>) -> memref<4x512xf16> + %43 = call @Unknown63(%42) : (memref<4x512xf16>) -> memref<4x512xf16> + %alloc_40 = memref.alloc() : memref<4x1000xf16> + byre.compute @MatmulOp_f16f16_f16(%43, %22, %alloc_40) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<4x512xf16>, memref<1000x512xf16>, memref<4x1000xf16> + %44 = call @Unknown64(%23, %alloc_40) : (memref<1000xf16>, memref<4x1000xf16>) -> memref<4x1000xf16> + %45 = call @Unknown65(%44) : (memref<4x1000xf16>) -> memref<4xf16> + %46 = call @Unknown66(%45, %44) : (memref<4xf16>, memref<4x1000xf16>) -> memref<4x1000xf16> + %47 = call @Unknown67(%46) : (memref<4x1000xf16>) -> memref<4xf16> + %48 = call @Unknown68(%47) : (memref<4xf16>) -> memref<4xf16> + %49:2 = call @Unknown69(%48, %46, %24, %21) : (memref<4xf16>, memref<4x1000xf16>, memref<4xf16>, memref<4x1000xf16>) -> (memref<4x1000xf16>, memref<4x1000xf16>) %alloc_41 = memref.alloc() : memref<4x512xf16> - byre.compute @ReduceSumOp_f16_f16(%39#0, %alloc_41) {dimensions = dense<[3, 2]> : tensor<2xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<4x512xf16> - %40 = call @Unknown60(%alloc_41) : (memref<4x512xf16>) -> memref<4x512xf16> - %alloc_42 = memref.alloc() : memref<4x1000xf16> - byre.compute @MatmulOp_f16f16_f16(%40, %22, %alloc_42) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<4x512xf16>, memref<1000x512xf16>, memref<4x1000xf16> - %41 = call @Unknown61(%arg103, %alloc_42) : (memref<1000xf32>, memref<4x1000xf16>) -> memref<4x1000xf16> - %alloc_43 = memref.alloc() : memref<4xf16> - byre.compute @ReduceMaxOp_f16_f16(%41, %alloc_43) {dimensions = dense<1> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf16>, memref<4xf16> - %42:2 = call @Unknown62(%alloc_43, %41) : (memref<4xf16>, memref<4x1000xf16>) -> (memref<4x1000xf16>, memref<4x1000xf16>) - %alloc_44 = memref.alloc() : memref<4xf16> - byre.compute @ReduceSumOp_f16_f16(%42#1, %alloc_44) {dimensions = dense<1> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf16>, memref<4xf16> - %43:3 = call @Unknown63(%alloc_44, %42#0, %alloc_1, %21, %arg1) : (memref<4xf16>, memref<4x1000xf16>, memref<4xf16>, memref<4x1000xf16>, memref<4x1000xf32>) -> (memref<4x1000xf16>, memref<4x1000xf32>, memref<4x1000xf32>) - %alloc_45 = memref.alloc() : memref<4x512xf16> - byre.compute @MatmulOp_f16f16_f16(%43#0, %22, %alloc_45) {lhs_contracting_dimension = 
1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<4x1000xf16>, memref<1000x512xf16>, memref<4x512xf16> - %44 = call @Unknown64(%alloc_45, %39#1) : (memref<4x512xf16>, memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> - %alloc_46 = memref.alloc() : memref<4x512x7x7xf16> - %alloc_47 = memref.alloc() : memref<512xf32> + byre.compute @MatmulOp_f16f16_f16(%49#1, %22, %alloc_41) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<4x1000xf16>, memref<1000x512xf16>, memref<4x512xf16> + %50 = call @Unknown70(%alloc_41, %41#1) : (memref<4x512xf16>, memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> + %alloc_42 = memref.alloc() : memref<4x512x7x7xf16> + %alloc_43 = memref.alloc() : memref<512xf32> + %alloc_44 = memref.alloc() : memref<512xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_38, %arg98, %50, %alloc_42, %alloc_43, %alloc_44) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> + %alloc_45 = memref.alloc() : memref<4x512x7x7xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_42, %20, %alloc_45) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> + %alloc_46 = memref.alloc() : memref<512x512x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%40#0, %alloc_42, %alloc_46) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512x512x3x3xf16> + %51 = call @Unknown74(%40#1, %alloc_45) : (memref<4x512x7x7xi1>, memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> + %alloc_47 = memref.alloc() : memref<4x512x7x7xf16> %alloc_48 = memref.alloc() : memref<512xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_39, %arg98, %44, %alloc_46, %alloc_47, %alloc_48) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %alloc_49 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_46, %20, %alloc_49) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> - %alloc_50 = memref.alloc() : memref<512x512x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%38#0, %alloc_46, %alloc_50) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : 
tensor<2xi64>} : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512x512x3x3xf16> - %45 = call @Unknown68(%38#1, %alloc_49) : (memref<4x512x7x7xi1>, memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> - %alloc_51 = memref.alloc() : memref<4x512x7x7xf16> - %alloc_52 = memref.alloc() : memref<512xf32> + %alloc_49 = memref.alloc() : memref<512xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_36, %arg93, %51, %alloc_47, %alloc_48, %alloc_49) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> + %alloc_50 = memref.alloc() : memref<4x512x7x7xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_47, %19, %alloc_50) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> + %alloc_51 = memref.alloc() : memref<512x512x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%39#0, %alloc_47, %alloc_51) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512x512x3x3xf16> + %52 = call @Unknown78(%50, %alloc_50, %39#1) : (memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> + %alloc_52 = memref.alloc() : memref<4x512x7x7xf16> %alloc_53 = memref.alloc() : memref<512xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_37, %arg93, %45, %alloc_51, %alloc_52, %alloc_53) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %alloc_54 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_51, %19, %alloc_54) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> - %alloc_55 = memref.alloc() : memref<512x512x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%37#0, %alloc_51, %alloc_55) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512x512x3x3xf16> - %46 = call @Unknown72(%44, %alloc_54, %37#1) : (memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> - %alloc_56 = memref.alloc() : memref<4x512x7x7xf16> - %alloc_57 = memref.alloc() : memref<512xf32> + %alloc_54 = memref.alloc() : memref<512xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_34, %arg83, %52, %alloc_52, %alloc_53, %alloc_54) {epsilon = 
9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> + %alloc_55 = memref.alloc() : memref<4x512x7x7xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_52, %18, %alloc_55) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> + %alloc_56 = memref.alloc() : memref<512x512x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%38#0, %alloc_52, %alloc_56) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512x512x3x3xf16> + %53 = call @Unknown74(%38#1, %alloc_55) : (memref<4x512x7x7xi1>, memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> + %alloc_57 = memref.alloc() : memref<4x512x7x7xf16> %alloc_58 = memref.alloc() : memref<512xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_35, %arg83, %46, %alloc_56, %alloc_57, %alloc_58) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %alloc_59 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_56, %18, %alloc_59) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> - %alloc_60 = memref.alloc() : memref<512x512x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%36#0, %alloc_56, %alloc_60) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512x512x3x3xf16> - %47 = call @Unknown76(%36#1, %alloc_59) : (memref<4x512x7x7xi1>, memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> - %alloc_61 = memref.alloc() : memref<4x512x7x7xf16> - %alloc_62 = memref.alloc() : memref<512xf32> + %alloc_59 = memref.alloc() : memref<512xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_32, %arg78, %53, %alloc_57, %alloc_58, %alloc_59) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> + %alloc_60 = memref.alloc() : memref<4x256x14x14xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_57, %17, %alloc_60) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], 
output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x256x3x3xf16>, memref<4x256x14x14xf16> + %alloc_61 = memref.alloc() : memref<512x256x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%37#0, %alloc_57, %alloc_61) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x512x7x7xf16>, memref<512x256x3x3xf16> + %alloc_62 = memref.alloc() : memref<4x512x7x7xf16> %alloc_63 = memref.alloc() : memref<512xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_33, %arg78, %47, %alloc_61, %alloc_62, %alloc_63) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %alloc_64 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_61, %17, %alloc_64) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x256x3x3xf16>, memref<4x256x14x14xf16> - %alloc_65 = memref.alloc() : memref<512x256x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%35#0, %alloc_61, %alloc_65) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x512x7x7xf16>, memref<512x256x3x3xf16> - %alloc_66 = memref.alloc() : memref<4x512x7x7xf16> - %alloc_67 = memref.alloc() : memref<512xf32> - %alloc_68 = memref.alloc() : memref<512xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_31, %arg88, %46, %alloc_66, %alloc_67, %alloc_68) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %alloc_69 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_66, %16, %alloc_69) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x256x1x1xf16>, memref<4x256x14x14xf16> - %alloc_70 = memref.alloc() : memref<512x256x1x1xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%35#0, %alloc_66, %alloc_70) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x512x7x7xf16>, memref<512x256x1x1xf16> - %48 = call @Unknown83(%alloc_69, %alloc_64, %35#1) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, 
memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> - %alloc_71 = memref.alloc() : memref<4x256x14x14xf16> - %alloc_72 = memref.alloc() : memref<256xf32> + %alloc_64 = memref.alloc() : memref<512xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_30, %arg88, %52, %alloc_62, %alloc_63, %alloc_64) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> + %alloc_65 = memref.alloc() : memref<4x256x14x14xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_62, %16, %alloc_65) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x256x1x1xf16>, memref<4x256x14x14xf16> + %alloc_66 = memref.alloc() : memref<512x256x1x1xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%37#0, %alloc_62, %alloc_66) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x512x7x7xf16>, memref<512x256x1x1xf16> + %54 = call @Unknown89(%alloc_65, %alloc_60, %37#1) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> + %alloc_67 = memref.alloc() : memref<4x256x14x14xf16> + %alloc_68 = memref.alloc() : memref<256xf32> + %alloc_69 = memref.alloc() : memref<256xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_28, %arg73, %54, %alloc_67, %alloc_68, %alloc_69) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> + %alloc_70 = memref.alloc() : memref<4x256x14x14xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_67, %15, %alloc_70) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> + %alloc_71 = memref.alloc() : memref<256x256x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%36#0, %alloc_67, %alloc_71) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256x256x3x3xf16> + %55 = call @Unknown93(%36#1, %alloc_70) : (memref<4x256x14x14xi1>, memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> + %alloc_72 = memref.alloc() : memref<4x256x14x14xf16> %alloc_73 = memref.alloc() : memref<256xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_29, %arg73, %48, %alloc_71, %alloc_72, %alloc_73) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : 
memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %alloc_74 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_71, %15, %alloc_74) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> - %alloc_75 = memref.alloc() : memref<256x256x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%34#0, %alloc_71, %alloc_75) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256x256x3x3xf16> - %49 = call @Unknown87(%34#1, %alloc_74) : (memref<4x256x14x14xi1>, memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> - %alloc_76 = memref.alloc() : memref<4x256x14x14xf16> - %alloc_77 = memref.alloc() : memref<256xf32> + %alloc_74 = memref.alloc() : memref<256xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_26, %arg68, %55, %alloc_72, %alloc_73, %alloc_74) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> + %alloc_75 = memref.alloc() : memref<4x256x14x14xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_72, %14, %alloc_75) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> + %alloc_76 = memref.alloc() : memref<256x256x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%35#0, %alloc_72, %alloc_76) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256x256x3x3xf16> + %56 = call @Unknown89(%54, %alloc_75, %35#1) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> + %alloc_77 = memref.alloc() : memref<4x256x14x14xf16> %alloc_78 = memref.alloc() : memref<256xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_27, %arg68, %49, %alloc_76, %alloc_77, %alloc_78) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %alloc_79 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_76, %14, %alloc_79) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> 
: tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> - %alloc_80 = memref.alloc() : memref<256x256x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%33#0, %alloc_76, %alloc_80) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256x256x3x3xf16> - %50 = call @Unknown91(%48, %alloc_79, %33#1) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> - %alloc_81 = memref.alloc() : memref<4x256x14x14xf16> - %alloc_82 = memref.alloc() : memref<256xf32> + %alloc_79 = memref.alloc() : memref<256xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_24, %arg58, %56, %alloc_77, %alloc_78, %alloc_79) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> + %alloc_80 = memref.alloc() : memref<4x256x14x14xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_77, %13, %alloc_80) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> + %alloc_81 = memref.alloc() : memref<256x256x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%34#0, %alloc_77, %alloc_81) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256x256x3x3xf16> + %57 = call @Unknown93(%34#1, %alloc_80) : (memref<4x256x14x14xi1>, memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> + %alloc_82 = memref.alloc() : memref<4x256x14x14xf16> %alloc_83 = memref.alloc() : memref<256xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_25, %arg58, %50, %alloc_81, %alloc_82, %alloc_83) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %alloc_84 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_81, %13, %alloc_84) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> - %alloc_85 = memref.alloc() : memref<256x256x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%32#0, %alloc_81, %alloc_85) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", 
padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256x256x3x3xf16> - %51 = call @Unknown95(%32#1, %alloc_84) : (memref<4x256x14x14xi1>, memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> - %alloc_86 = memref.alloc() : memref<4x256x14x14xf16> - %alloc_87 = memref.alloc() : memref<256xf32> + %alloc_84 = memref.alloc() : memref<256xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_22, %arg53, %57, %alloc_82, %alloc_83, %alloc_84) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> + %alloc_85 = memref.alloc() : memref<4x128x28x28xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_82, %12, %alloc_85) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x128x3x3xf16>, memref<4x128x28x28xf16> + %alloc_86 = memref.alloc() : memref<256x128x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%33#0, %alloc_82, %alloc_86) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x256x14x14xf16>, memref<256x128x3x3xf16> + %alloc_87 = memref.alloc() : memref<4x256x14x14xf16> %alloc_88 = memref.alloc() : memref<256xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_23, %arg53, %51, %alloc_86, %alloc_87, %alloc_88) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %alloc_89 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_86, %12, %alloc_89) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x128x3x3xf16>, memref<4x128x28x28xf16> - %alloc_90 = memref.alloc() : memref<256x128x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%31#0, %alloc_86, %alloc_90) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x256x14x14xf16>, memref<256x128x3x3xf16> - %alloc_91 = memref.alloc() : memref<4x256x14x14xf16> - %alloc_92 = memref.alloc() : memref<256xf32> - %alloc_93 = memref.alloc() : memref<256xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_21, %arg63, %50, %alloc_91, %alloc_92, %alloc_93) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, 
memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %alloc_94 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_91, %11, %alloc_94) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x128x1x1xf16>, memref<4x128x28x28xf16> - %alloc_95 = memref.alloc() : memref<256x128x1x1xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%31#0, %alloc_91, %alloc_95) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x256x14x14xf16>, memref<256x128x1x1xf16> - %52 = call @Unknown102(%alloc_94, %alloc_89, %31#1) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> - %alloc_96 = memref.alloc() : memref<4x128x28x28xf16> - %alloc_97 = memref.alloc() : memref<128xf32> + %alloc_89 = memref.alloc() : memref<256xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_20, %arg63, %56, %alloc_87, %alloc_88, %alloc_89) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> + %alloc_90 = memref.alloc() : memref<4x128x28x28xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_87, %11, %alloc_90) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x128x1x1xf16>, memref<4x128x28x28xf16> + %alloc_91 = memref.alloc() : memref<256x128x1x1xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%33#0, %alloc_87, %alloc_91) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x256x14x14xf16>, memref<256x128x1x1xf16> + %58 = call @Unknown108(%alloc_90, %alloc_85, %33#1) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> + %alloc_92 = memref.alloc() : memref<4x128x28x28xf16> + %alloc_93 = memref.alloc() : memref<128xf32> + %alloc_94 = memref.alloc() : memref<128xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_18, %arg48, %58, %alloc_92, %alloc_93, %alloc_94) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> + %alloc_95 = memref.alloc() : memref<4x128x28x28xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_92, %10, %alloc_95) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 
: i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> + %alloc_96 = memref.alloc() : memref<128x128x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%32#0, %alloc_92, %alloc_96) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128x128x3x3xf16> + %59 = call @Unknown112(%32#1, %alloc_95) : (memref<4x128x28x28xi1>, memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> + %alloc_97 = memref.alloc() : memref<4x128x28x28xf16> %alloc_98 = memref.alloc() : memref<128xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_19, %arg48, %52, %alloc_96, %alloc_97, %alloc_98) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %alloc_99 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_96, %10, %alloc_99) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> - %alloc_100 = memref.alloc() : memref<128x128x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%30#0, %alloc_96, %alloc_100) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128x128x3x3xf16> - %53 = call @Unknown106(%30#1, %alloc_99) : (memref<4x128x28x28xi1>, memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> - %alloc_101 = memref.alloc() : memref<4x128x28x28xf16> - %alloc_102 = memref.alloc() : memref<128xf32> + %alloc_99 = memref.alloc() : memref<128xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_16, %arg43, %59, %alloc_97, %alloc_98, %alloc_99) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> + %alloc_100 = memref.alloc() : memref<4x128x28x28xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_97, %9, %alloc_100) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> + %alloc_101 = memref.alloc() : memref<128x128x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%31#0, %alloc_97, %alloc_101) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : 
i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128x128x3x3xf16> + %60 = call @Unknown108(%58, %alloc_100, %31#1) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> + %alloc_102 = memref.alloc() : memref<4x128x28x28xf16> %alloc_103 = memref.alloc() : memref<128xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_17, %arg43, %53, %alloc_101, %alloc_102, %alloc_103) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %alloc_104 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_101, %9, %alloc_104) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> - %alloc_105 = memref.alloc() : memref<128x128x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%29#0, %alloc_101, %alloc_105) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128x128x3x3xf16> - %54 = call @Unknown110(%52, %alloc_104, %29#1) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> - %alloc_106 = memref.alloc() : memref<4x128x28x28xf16> - %alloc_107 = memref.alloc() : memref<128xf32> + %alloc_104 = memref.alloc() : memref<128xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_14, %arg33, %60, %alloc_102, %alloc_103, %alloc_104) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> + %alloc_105 = memref.alloc() : memref<4x128x28x28xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_102, %8, %alloc_105) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> + %alloc_106 = memref.alloc() : memref<128x128x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%30#0, %alloc_102, %alloc_106) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128x128x3x3xf16> + %61 = call @Unknown112(%30#1, %alloc_105) : (memref<4x128x28x28xi1>, memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> + %alloc_107 = memref.alloc() : memref<4x128x28x28xf16> %alloc_108 = memref.alloc() : 
memref<128xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_15, %arg33, %54, %alloc_106, %alloc_107, %alloc_108) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %alloc_109 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_106, %8, %alloc_109) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> - %alloc_110 = memref.alloc() : memref<128x128x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%28#0, %alloc_106, %alloc_110) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128x128x3x3xf16> - %55 = call @Unknown114(%28#1, %alloc_109) : (memref<4x128x28x28xi1>, memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> - %alloc_111 = memref.alloc() : memref<4x128x28x28xf16> - %alloc_112 = memref.alloc() : memref<128xf32> + %alloc_109 = memref.alloc() : memref<128xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_12, %arg28, %61, %alloc_107, %alloc_108, %alloc_109) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> + %alloc_110 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_107, %7, %alloc_110) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x64x3x3xf16>, memref<4x64x56x56xf16> + %alloc_111 = memref.alloc() : memref<128x64x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%29#0, %alloc_107, %alloc_111) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x128x28x28xf16>, memref<128x64x3x3xf16> + %alloc_112 = memref.alloc() : memref<4x128x28x28xf16> %alloc_113 = memref.alloc() : memref<128xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_13, %arg28, %55, %alloc_111, %alloc_112, %alloc_113) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %alloc_114 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_111, %7, %alloc_114) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, 
input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x64x3x3xf16>, memref<4x64x56x56xf16> - %alloc_115 = memref.alloc() : memref<128x64x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%27#0, %alloc_111, %alloc_115) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x128x28x28xf16>, memref<128x64x3x3xf16> - %alloc_116 = memref.alloc() : memref<4x128x28x28xf16> - %alloc_117 = memref.alloc() : memref<128xf32> - %alloc_118 = memref.alloc() : memref<128xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_11, %arg38, %54, %alloc_116, %alloc_117, %alloc_118) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %alloc_119 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_116, %6, %alloc_119) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x64x1x1xf16>, memref<4x64x56x56xf16> - %alloc_120 = memref.alloc() : memref<128x64x1x1xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%27#0, %alloc_116, %alloc_120) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x128x28x28xf16>, memref<128x64x1x1xf16> - %56 = call @Unknown121(%alloc_119, %alloc_114, %27#1) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> - %alloc_121 = memref.alloc() : memref<4x64x56x56xf16> - %alloc_122 = memref.alloc() : memref<64xf32> + %alloc_114 = memref.alloc() : memref<128xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_10, %arg38, %60, %alloc_112, %alloc_113, %alloc_114) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> + %alloc_115 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_112, %6, %alloc_115) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x64x1x1xf16>, memref<4x64x56x56xf16> + %alloc_116 = memref.alloc() : memref<128x64x1x1xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%29#0, %alloc_112, %alloc_116) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = 
"NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x128x28x28xf16>, memref<128x64x1x1xf16> + %62 = call @Unknown127(%alloc_115, %alloc_110, %29#1) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> + %alloc_117 = memref.alloc() : memref<4x64x56x56xf16> + %alloc_118 = memref.alloc() : memref<64xf32> + %alloc_119 = memref.alloc() : memref<64xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_8, %arg23, %62, %alloc_117, %alloc_118, %alloc_119) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> + %alloc_120 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_117, %5, %alloc_120) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + %alloc_121 = memref.alloc() : memref<64x64x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%28#0, %alloc_117, %alloc_121) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> + %63 = call @Unknown131(%28#1, %alloc_120) : (memref<4x64x56x56xi1>, memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> + %alloc_122 = memref.alloc() : memref<4x64x56x56xf16> %alloc_123 = memref.alloc() : memref<64xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_9, %arg23, %56, %alloc_121, %alloc_122, %alloc_123) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> - %alloc_124 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_121, %5, %alloc_124) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> - %alloc_125 = memref.alloc() : memref<64x64x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%26#0, %alloc_121, %alloc_125) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> - %57 = call @Unknown125(%26#1, %alloc_124) : (memref<4x64x56x56xi1>, memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> - %alloc_126 = memref.alloc() : memref<4x64x56x56xf16> - %alloc_127 = memref.alloc() : memref<64xf32> + %alloc_124 
= memref.alloc() : memref<64xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_6, %arg18, %63, %alloc_122, %alloc_123, %alloc_124) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> + %alloc_125 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_122, %4, %alloc_125) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + %alloc_126 = memref.alloc() : memref<64x64x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%27#0, %alloc_122, %alloc_126) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> + %64 = call @Unknown127(%62, %alloc_125, %27#1) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> + %alloc_127 = memref.alloc() : memref<4x64x56x56xf16> %alloc_128 = memref.alloc() : memref<64xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_7, %arg18, %57, %alloc_126, %alloc_127, %alloc_128) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> - %alloc_129 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_126, %4, %alloc_129) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> - %alloc_130 = memref.alloc() : memref<64x64x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%25#0, %alloc_126, %alloc_130) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> - %58 = call @Unknown129(%56, %alloc_129, %25#1) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> - %alloc_131 = memref.alloc() : memref<4x64x56x56xf16> - %alloc_132 = memref.alloc() : memref<64xf32> + %alloc_129 = memref.alloc() : memref<64xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_4, %arg13, %64, %alloc_127, %alloc_128, %alloc_129) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> + %alloc_130 = memref.alloc() : 
memref<4x64x56x56xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_127, %3, %alloc_130) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + %alloc_131 = memref.alloc() : memref<64x64x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%26#0, %alloc_127, %alloc_131) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> + %65 = call @Unknown131(%26#1, %alloc_130) : (memref<4x64x56x56xi1>, memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> + %alloc_132 = memref.alloc() : memref<4x64x56x56xf16> %alloc_133 = memref.alloc() : memref<64xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_5, %arg13, %58, %alloc_131, %alloc_132, %alloc_133) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> - %alloc_134 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_131, %3, %alloc_134) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> - %alloc_135 = memref.alloc() : memref<64x64x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%24#0, %alloc_131, %alloc_135) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> - %59 = call @Unknown133(%24#1, %alloc_134) : (memref<4x64x56x56xi1>, memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> - %alloc_136 = memref.alloc() : memref<4x64x56x56xf16> - %alloc_137 = memref.alloc() : memref<64xf32> - %alloc_138 = memref.alloc() : memref<64xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_3, %arg8, %59, %alloc_136, %alloc_137, %alloc_138) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> - %alloc_139 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_136, %2, %alloc_139) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> - %alloc_140 = memref.alloc() : memref<64x64x3x3xf16> - byre.compute 
@ConvBackwardFilterOp_f16f16_f16(%alloc_2, %alloc_136, %alloc_140) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> - %60 = call @Unknown137(%58, %alloc_139) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> - %alloc_141 = memref.alloc() : memref<4x64x112x112xf16> - byre.compute @PoolMaxGradOp_f16f16_f16(%23#0, %60, %alloc_141) {memory_effects = [1 : i32, 1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16>, memref<4x64x56x56xf16>, memref<4x64x112x112xf16> - %61 = call @Unknown138(%23#1, %alloc_141) : (memref<4x64x112x112xi1>, memref<4x64x112x112xf16>) -> memref<4x64x112x112xf16> - %alloc_142 = memref.alloc() : memref<4x64x112x112xf16> - %alloc_143 = memref.alloc() : memref<64xf32> - %alloc_144 = memref.alloc() : memref<64xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc, %arg3, %61, %alloc_142, %alloc_143, %alloc_144) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x112x112xf16>, memref<64xf32>, memref<4x64x112x112xf16>, memref<4x64x112x112xf16>, memref<64xf32>, memref<64xf32> - %alloc_145 = memref.alloc() : memref<64x3x7x7xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%0, %alloc_142, %alloc_145) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x3x224x224xf16>, memref<4x64x112x112xf16>, memref<64x3x7x7xf16> - %alloc_146 = memref.alloc() : memref - byre.compute @ReduceSumOp_f32_f32(%43#1, %alloc_146) {dimensions = dense<[0, 1]> : tensor<2xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf32>, memref - %62 = call @Unknown141(%alloc_146) : (memref) -> memref - %63 = call @Unknown142(%alloc_145) : (memref<64x3x7x7xf16>) -> memref<64x3x7x7xf32> - %64 = call @Unknown143(%alloc_140) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %65 = call @Unknown144(%alloc_135) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %66 = call @Unknown145(%alloc_130) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %67 = call @Unknown146(%alloc_125) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %68 = call @Unknown147(%alloc_115) : (memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> - %69 = call @Unknown148(%alloc_110) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> - %70 = call @Unknown149(%alloc_120) : (memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> - %71 = call @Unknown150(%alloc_105) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> - %72 = call @Unknown151(%alloc_100) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> - %73 = call @Unknown152(%alloc_90) : (memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> - %74 = call @Unknown153(%alloc_85) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> - %75 = call @Unknown154(%alloc_95) : (memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> - %76 = call @Unknown155(%alloc_80) : (memref<256x256x3x3xf16>) -> 
memref<256x256x3x3xf32> - %77 = call @Unknown156(%alloc_75) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> - %78 = call @Unknown157(%alloc_65) : (memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> - %79 = call @Unknown158(%alloc_60) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> - %80 = call @Unknown159(%alloc_70) : (memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> - %81 = call @Unknown160(%alloc_55) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> - %82 = call @Unknown161(%alloc_50) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> - %alloc_147 = memref.alloc() : memref<1000x512xf16> - byre.compute @MatmulOp_f16f16_f16(%40, %43#0, %alloc_147) {lhs_contracting_dimension = 0 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_transpose, rhs_contracting_dimension = 0 : i64} : memref<4x512xf16>, memref<4x1000xf16>, memref<1000x512xf16> - %83 = call @Unknown163(%alloc_147) : (memref<1000x512xf16>) -> memref<1000x512xf32> - %alloc_148 = memref.alloc() : memref<1000xf32> - byre.compute @ReduceSumOp_f32_f32(%43#2, %alloc_148) {dimensions = dense<0> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf32>, memref<1000xf32> - %84 = call @Unknown164(%alloc_148) : (memref<1000xf32>) -> memref<1000xf32> - return %62, %63, %alloc_143, %alloc_144, %64, %alloc_137, %alloc_138, %65, %alloc_132, %alloc_133, %66, %alloc_127, %alloc_128, %67, %alloc_122, %alloc_123, %68, %alloc_112, %alloc_113, %69, %alloc_107, %alloc_108, %70, %alloc_117, %alloc_118, %71, %alloc_102, %alloc_103, %72, %alloc_97, %alloc_98, %73, %alloc_87, %alloc_88, %74, %alloc_82, %alloc_83, %75, %alloc_92, %alloc_93, %76, %alloc_77, %alloc_78, %77, %alloc_72, %alloc_73, %78, %alloc_62, %alloc_63, %79, %alloc_57, %alloc_58, %80, %alloc_67, %alloc_68, %81, %alloc_52, %alloc_53, %82, %alloc_47, %alloc_48, %83, %84 : memref, memref<64x3x7x7xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<128x64x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x64x1x1xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<256x128x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x128x1x1xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<512x256x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x256x1x1xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<1000x512xf32>, memref<1000xf32> + %alloc_134 = memref.alloc() : memref<64xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_2, %arg8, %65, %alloc_132, %alloc_133, %alloc_134) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> + %alloc_135 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute 
@ConvBackwardDataOp_f16f16_f16(%alloc_132, %2, %alloc_135) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + %alloc_136 = memref.alloc() : memref<64x64x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%alloc_1, %alloc_132, %alloc_136) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> + %66 = call @Unknown143(%64, %alloc_135) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> + %alloc_137 = memref.alloc() : memref<4x64x112x112xf16> + byre.compute @PoolMaxGradOp_f16f16_f16(%25#0, %66, %alloc_137) {memory_effects = [1 : i32, 1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16>, memref<4x64x56x56xf16>, memref<4x64x112x112xf16> + %67 = call @Unknown144(%25#1, %alloc_137) : (memref<4x64x112x112xi1>, memref<4x64x112x112xf16>) -> memref<4x64x112x112xf16> + %alloc_138 = memref.alloc() : memref<4x64x112x112xf16> + %alloc_139 = memref.alloc() : memref<64xf32> + %alloc_140 = memref.alloc() : memref<64xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc, %arg3, %67, %alloc_138, %alloc_139, %alloc_140) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x112x112xf16>, memref<64xf32>, memref<4x64x112x112xf16>, memref<4x64x112x112xf16>, memref<64xf32>, memref<64xf32> + %alloc_141 = memref.alloc() : memref<64x3x7x7xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%0, %alloc_138, %alloc_141) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x3x224x224xf16>, memref<4x64x112x112xf16>, memref<64x3x7x7xf16> + %68 = call @Unknown147(%49#0, %arg1) : (memref<4x1000xf16>, memref<4x1000xf32>) -> memref + %69 = call @Unknown148(%68) : (memref) -> memref + %70 = call @Unknown149(%alloc_141) : (memref<64x3x7x7xf16>) -> memref<64x3x7x7xf32> + %71 = call @Unknown150(%alloc_136) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %72 = call @Unknown150(%alloc_131) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %73 = call @Unknown150(%alloc_126) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %74 = call @Unknown150(%alloc_121) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %75 = call @Unknown154(%alloc_111) : (memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> + %76 = call @Unknown155(%alloc_106) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> + %77 = call @Unknown156(%alloc_116) : (memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> + %78 = call @Unknown155(%alloc_101) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> + %79 = call @Unknown155(%alloc_96) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> + %80 = call 
@Unknown159(%alloc_86) : (memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> + %81 = call @Unknown160(%alloc_81) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> + %82 = call @Unknown161(%alloc_91) : (memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> + %83 = call @Unknown160(%alloc_76) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> + %84 = call @Unknown160(%alloc_71) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> + %85 = call @Unknown164(%alloc_61) : (memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> + %86 = call @Unknown165(%alloc_56) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> + %87 = call @Unknown166(%alloc_66) : (memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> + %88 = call @Unknown165(%alloc_51) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> + %89 = call @Unknown165(%alloc_46) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> + %alloc_142 = memref.alloc() : memref<1000x512xf16> + byre.compute @MatmulOp_f16f16_f16(%43, %49#1, %alloc_142) {lhs_contracting_dimension = 0 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_transpose, rhs_contracting_dimension = 0 : i64} : memref<4x512xf16>, memref<4x1000xf16>, memref<1000x512xf16> + %90 = call @Unknown170(%alloc_142) : (memref<1000x512xf16>) -> memref<1000x512xf32> + %91 = call @Unknown171(%49#1) : (memref<4x1000xf16>) -> memref<1000xf32> + %92 = call @Unknown172(%91) : (memref<1000xf32>) -> memref<1000xf32> + return %69, %70, %alloc_139, %alloc_140, %71, %alloc_133, %alloc_134, %72, %alloc_128, %alloc_129, %73, %alloc_123, %alloc_124, %74, %alloc_118, %alloc_119, %75, %alloc_108, %alloc_109, %76, %alloc_103, %alloc_104, %77, %alloc_113, %alloc_114, %78, %alloc_98, %alloc_99, %79, %alloc_93, %alloc_94, %80, %alloc_83, %alloc_84, %81, %alloc_78, %alloc_79, %82, %alloc_88, %alloc_89, %83, %alloc_73, %alloc_74, %84, %alloc_68, %alloc_69, %85, %alloc_58, %alloc_59, %86, %alloc_53, %alloc_54, %87, %alloc_63, %alloc_64, %88, %alloc_48, %alloc_49, %89, %alloc_43, %alloc_44, %90, %92 : memref, memref<64x3x7x7xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<128x64x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x64x1x1xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<256x128x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x128x1x1xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<512x256x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x256x1x1xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<1000x512xf32>, memref<1000xf32> } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/Whole/6_gpu_opt.mlir b/compiler/test/E2E/ResNet18/Whole/6_gpu_opt.mlir index 121c21a6b..ad5504a74 100644 --- a/compiler/test/E2E/ResNet18/Whole/6_gpu_opt.mlir +++ b/compiler/test/E2E/ResNet18/Whole/6_gpu_opt.mlir @@ -4,2766 +4,1530 @@ module 
@IrToMhlo.2452 { func.func private @Unknown0(%arg0: memref<4x3x224x224xf32>) -> memref<4x3x224x224xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c602112 = arith.constant 602112 : index - %c1 = arith.constant 1 : index %c224 = arith.constant 224 : index - %c-1 = arith.constant -1 : index %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c602112 = arith.constant 602112 : index %alloc = memref.alloc() : memref<4x3x224x224xf16> scf.for %arg1 = %c0 to %c602112 step %c1 { %0 = arith.remsi %arg1, %c224 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c224 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c224 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c224 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c224 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c224 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c3 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c3 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c3 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x3x224x224xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<4x3x224x224xf16> + %1 = arith.divsi %arg1, %c224 : index + %2 = arith.remsi %1, %c224 : index + %3 = arith.divsi %1, %c224 : index + %4 = arith.remsi %3, %c3 : index + %5 = arith.divsi %3, %c3 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<4x3x224x224xf32> + %7 = arith.truncf %6 : f32 to f16 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<4x3x224x224xf16> } return %alloc : memref<4x3x224x224xf16> } func.func private @Unknown1(%arg0: memref<64x3x7x7xf32>) -> memref<64x3x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c9408 = arith.constant 9408 : index - %c1 = arith.constant 1 : index %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c9408 = arith.constant 9408 : index %alloc = memref.alloc() : memref<64x3x7x7xf16> scf.for %arg1 = %c0 to %c9408 step %c1 { %0 = arith.remsi %arg1, %c7 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c7 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c7 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c7 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c7 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c7 : index - %18 = arith.subi %c-1, 
%17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c3 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c3 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c3 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<64x3x7x7xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<64x3x7x7xf16> + %1 = arith.divsi %arg1, %c7 : index + %2 = arith.remsi %1, %c7 : index + %3 = arith.divsi %1, %c7 : index + %4 = arith.remsi %3, %c3 : index + %5 = arith.divsi %3, %c3 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<64x3x7x7xf32> + %7 = arith.truncf %6 : f32 to f16 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<64x3x7x7xf16> } return %alloc : memref<64x3x7x7xf16> } func.func private @Unknown3(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %alloc = memref.alloc() : memref<64x64x3x3xf16> - scf.for %arg1 = %c0 to %c36864 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<64x64x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<64x64x3x3xf16> - } - return %alloc : memref<64x64x3x3xf16> - } - func.func private @Unknown4(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c1 = arith.constant 1 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %alloc = memref.alloc() : memref<64x64x3x3xf16> - scf.for %arg1 = %c0 to %c36864 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = 
arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<64x64x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<64x64x3x3xf16> - } - return %alloc : memref<64x64x3x3xf16> - } - func.func private @Unknown5(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index - %alloc = memref.alloc() : memref<64x64x3x3xf16> - scf.for %arg1 = %c0 to %c36864 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<64x64x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<64x64x3x3xf16> - } - return %alloc : memref<64x64x3x3xf16> - } - func.func private @Unknown6(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { %c0 = arith.constant 0 : index %c36864 = arith.constant 36864 : index - %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index %alloc = memref.alloc() : memref<64x64x3x3xf16> scf.for %arg1 = %c0 to %c36864 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : 
index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<64x64x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<64x64x3x3xf16> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c64 : index + %5 = arith.divsi %3, %c64 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<64x64x3x3xf32> + %7 = arith.truncf %6 : f32 to f16 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<64x64x3x3xf16> } return %alloc : memref<64x64x3x3xf16> } func.func private @Unknown7(%arg0: memref<128x64x1x1xf32>) -> memref<128x64x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %c8192 = arith.constant 8192 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<128x64x1x1xf16> scf.for %arg1 = %c0 to %c8192 step %c1 { %0 = arith.remsi %arg1, %c64 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c64 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c64 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = memref.load %arg0[%9, %3, %c0, %c0] : memref<128x64x1x1xf32> - %11 = arith.truncf %10 : f32 to f16 - memref.store %11, %alloc[%9, %3, %c0, %c0] : memref<128x64x1x1xf16> + %1 = arith.divsi %arg1, %c64 : index + %2 = memref.load %arg0[%1, %0, %c0, %c0] : memref<128x64x1x1xf32> + %3 = arith.truncf %2 : f32 to f16 + memref.store %3, %alloc[%1, %0, %c0, %c0] : memref<128x64x1x1xf16> } return %alloc : memref<128x64x1x1xf16> } func.func private @Unknown8(%arg0: memref<128x64x3x3xf32>) -> memref<128x64x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c73728 = arith.constant 73728 : index - %c1 = arith.constant 1 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c73728 = arith.constant 73728 : index %alloc = memref.alloc() : memref<128x64x3x3xf16> scf.for %arg1 = %c0 to %c73728 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = 
arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<128x64x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<128x64x3x3xf16> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c64 : index + %5 = arith.divsi %3, %c64 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<128x64x3x3xf32> + %7 = arith.truncf %6 : f32 to f16 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<128x64x3x3xf16> } return %alloc : memref<128x64x3x3xf16> } func.func private @Unknown9(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c1 = arith.constant 1 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %alloc = memref.alloc() : memref<128x128x3x3xf16> - scf.for %arg1 = %c0 to %c147456 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c128 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c128 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c128 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<128x128x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<128x128x3x3xf16> - } - return %alloc : memref<128x128x3x3xf16> - } - func.func private @Unknown10(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index 
- %c147456 = arith.constant 147456 : index %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index - %alloc = memref.alloc() : memref<128x128x3x3xf16> - scf.for %arg1 = %c0 to %c147456 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c128 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c128 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c128 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<128x128x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<128x128x3x3xf16> - } - return %alloc : memref<128x128x3x3xf16> - } - func.func private @Unknown11(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { %c0 = arith.constant 0 : index %c147456 = arith.constant 147456 : index - %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index %alloc = memref.alloc() : memref<128x128x3x3xf16> scf.for %arg1 = %c0 to %c147456 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c128 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c128 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c128 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<128x128x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<128x128x3x3xf16> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = 
arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c128 : index + %5 = arith.divsi %3, %c128 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<128x128x3x3xf32> + %7 = arith.truncf %6 : f32 to f16 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<128x128x3x3xf16> } return %alloc : memref<128x128x3x3xf16> } func.func private @Unknown12(%arg0: memref<256x128x1x1xf32>) -> memref<256x128x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %c32768 = arith.constant 32768 : index - %c1 = arith.constant 1 : index - %c128 = arith.constant 128 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<256x128x1x1xf16> scf.for %arg1 = %c0 to %c32768 step %c1 { %0 = arith.remsi %arg1, %c128 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c128 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c128 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = memref.load %arg0[%9, %3, %c0, %c0] : memref<256x128x1x1xf32> - %11 = arith.truncf %10 : f32 to f16 - memref.store %11, %alloc[%9, %3, %c0, %c0] : memref<256x128x1x1xf16> + %1 = arith.divsi %arg1, %c128 : index + %2 = memref.load %arg0[%1, %0, %c0, %c0] : memref<256x128x1x1xf32> + %3 = arith.truncf %2 : f32 to f16 + memref.store %3, %alloc[%1, %0, %c0, %c0] : memref<256x128x1x1xf16> } return %alloc : memref<256x128x1x1xf16> } func.func private @Unknown13(%arg0: memref<256x128x3x3xf32>) -> memref<256x128x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c294912 = arith.constant 294912 : index - %c1 = arith.constant 1 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c294912 = arith.constant 294912 : index %alloc = memref.alloc() : memref<256x128x3x3xf16> scf.for %arg1 = %c0 to %c294912 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c128 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c128 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c128 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<256x128x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<256x128x3x3xf16> + %1 = arith.divsi %arg1, %c3 : index + %2 = 
arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c128 : index + %5 = arith.divsi %3, %c128 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<256x128x3x3xf32> + %7 = arith.truncf %6 : f32 to f16 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<256x128x3x3xf16> } return %alloc : memref<256x128x3x3xf16> } func.func private @Unknown14(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c1 = arith.constant 1 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %alloc = memref.alloc() : memref<256x256x3x3xf16> - scf.for %arg1 = %c0 to %c589824 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c256 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c256 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c256 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<256x256x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<256x256x3x3xf16> - } - return %alloc : memref<256x256x3x3xf16> - } - func.func private @Unknown15(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index - %alloc = memref.alloc() : memref<256x256x3x3xf16> - scf.for %arg1 = %c0 to %c589824 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c256 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c256 : index - %23 = arith.select 
%21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c256 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<256x256x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<256x256x3x3xf16> - } - return %alloc : memref<256x256x3x3xf16> - } - func.func private @Unknown16(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { %c0 = arith.constant 0 : index %c589824 = arith.constant 589824 : index - %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<256x256x3x3xf16> scf.for %arg1 = %c0 to %c589824 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c256 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c256 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c256 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<256x256x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<256x256x3x3xf16> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c256 : index + %5 = arith.divsi %3, %c256 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<256x256x3x3xf32> + %7 = arith.truncf %6 : f32 to f16 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<256x256x3x3xf16> } return %alloc : memref<256x256x3x3xf16> } func.func private @Unknown17(%arg0: memref<512x256x1x1xf32>) -> memref<512x256x1x1xf16> attributes {__byteir_elementwise_fusion__} { + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %c131072 = arith.constant 131072 : index - %c1 = arith.constant 1 : index - %c256 = arith.constant 256 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<512x256x1x1xf16> scf.for %arg1 = %c0 to %c131072 step %c1 { %0 = arith.remsi %arg1, %c256 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c256 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c256 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = 
memref.load %arg0[%9, %3, %c0, %c0] : memref<512x256x1x1xf32> - %11 = arith.truncf %10 : f32 to f16 - memref.store %11, %alloc[%9, %3, %c0, %c0] : memref<512x256x1x1xf16> + %1 = arith.divsi %arg1, %c256 : index + %2 = memref.load %arg0[%1, %0, %c0, %c0] : memref<512x256x1x1xf32> + %3 = arith.truncf %2 : f32 to f16 + memref.store %3, %alloc[%1, %0, %c0, %c0] : memref<512x256x1x1xf16> } return %alloc : memref<512x256x1x1xf16> } func.func private @Unknown18(%arg0: memref<512x256x3x3xf32>) -> memref<512x256x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c1179648 = arith.constant 1179648 : index - %c1 = arith.constant 1 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c1179648 = arith.constant 1179648 : index %alloc = memref.alloc() : memref<512x256x3x3xf16> scf.for %arg1 = %c0 to %c1179648 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c256 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c256 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c256 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<512x256x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<512x256x3x3xf16> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c256 : index + %5 = arith.divsi %3, %c256 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<512x256x3x3xf32> + %7 = arith.truncf %6 : f32 to f16 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<512x256x3x3xf16> } return %alloc : memref<512x256x3x3xf16> } func.func private @Unknown19(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c1 = arith.constant 1 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %alloc = memref.alloc() : memref<512x512x3x3xf16> - scf.for %arg1 = %c0 to %c2359296 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index 
- %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c512 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c512 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c512 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<512x512x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<512x512x3x3xf16> - } - return %alloc : memref<512x512x3x3xf16> - } - func.func private @Unknown20(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c512 = arith.constant 512 : index - %alloc = memref.alloc() : memref<512x512x3x3xf16> - scf.for %arg1 = %c0 to %c2359296 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c512 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c512 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c512 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<512x512x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<512x512x3x3xf16> - } - return %alloc : memref<512x512x3x3xf16> - } - func.func private @Unknown21(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byteir_elementwise_fusion__} { %c0 = arith.constant 0 : index %c2359296 = arith.constant 2359296 : index - %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index %alloc = memref.alloc() : memref<512x512x3x3xf16> scf.for %arg1 = %c0 to %c2359296 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = 
arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c512 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c512 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c512 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<512x512x3x3xf32> - %31 = arith.truncf %30 : f32 to f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<512x512x3x3xf16> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c512 : index + %5 = arith.divsi %3, %c512 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<512x512x3x3xf32> + %7 = arith.truncf %6 : f32 to f16 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<512x512x3x3xf16> } return %alloc : memref<512x512x3x3xf16> } func.func private @Unknown22(%arg0: memref<4x1000xf32>) -> memref<4x1000xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant -2.500000e-01 : f32 + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index + %cst = arith.constant -2.500000e-01 : f32 %c4000 = arith.constant 4000 : index - %c1 = arith.constant 1 : index - %c1000 = arith.constant 1000 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<4x1000xf16> scf.for %arg1 = %c0 to %c4000 step %c1 { %0 = arith.remsi %arg1, %c1000 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c1000 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c1000 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = memref.load %arg0[%9, %3] : memref<4x1000xf32> - %11 = arith.mulf %10, %cst : f32 - %12 = arith.truncf %11 : f32 to f16 - memref.store %12, %alloc[%9, %3] : memref<4x1000xf16> + %1 = arith.divsi %arg1, %c1000 : index + %2 = memref.load %arg0[%1, %0] : memref<4x1000xf32> + %3 = arith.mulf %2, %cst : f32 + %4 = arith.truncf %3 : f32 to f16 + memref.store %4, %alloc[%1, %0] : memref<4x1000xf16> } return %alloc : memref<4x1000xf16> } func.func private @Unknown23(%arg0: memref<1000x512xf32>) -> memref<1000x512xf16> attributes {__byteir_elementwise_fusion__} { + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %c512000 = arith.constant 512000 : index - %c1 = arith.constant 1 : index - %c512 = arith.constant 512 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<1000x512xf16> scf.for %arg1 = %c0 to %c512000 step %c1 { %0 = arith.remsi %arg1, %c512 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c512 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = 
arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c512 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = memref.load %arg0[%9, %3] : memref<1000x512xf32> - %11 = arith.truncf %10 : f32 to f16 - memref.store %11, %alloc[%9, %3] : memref<1000x512xf16> + %1 = arith.divsi %arg1, %c512 : index + %2 = memref.load %arg0[%1, %0] : memref<1000x512xf32> + %3 = arith.truncf %2 : f32 to f16 + memref.store %3, %alloc[%1, %0] : memref<1000x512xf16> } return %alloc : memref<1000x512xf16> } - func.func private @Unknown24(%arg0: memref<4x64x112x112xf16>) -> (memref<4x64x112x112xf16>, memref<4x64x112x112xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 + func.func private @Unknown24(%arg0: memref<1000xf32>) -> memref<1000xf16> attributes {__byteir_elementwise_fusion__} { + %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index + %c0 = arith.constant 0 : index + %alloc = memref.alloc() : memref<1000xf16> + scf.for %arg1 = %c0 to %c1000 step %c1 { + %0 = memref.load %arg0[%arg1] : memref<1000xf32> + %1 = arith.truncf %0 : f32 to f16 + memref.store %1, %alloc[%arg1] : memref<1000xf16> + } + return %alloc : memref<1000xf16> + } + func.func private @Unknown25(%arg0: memref<4x1000xf16>) -> memref<4xf16> attributes {__byteir_reduction_fusion__} { %c0 = arith.constant 0 : index - %c3211264 = arith.constant 3211264 : index %c1 = arith.constant 1 : index - %c112 = arith.constant 112 : index + %cst = arith.constant 0.000000e+00 : f16 + %c2 = arith.constant 2 : index + %c512 = arith.constant 512 : index %c-1 = arith.constant -1 : index + %c-1024 = arith.constant -1024 : index + %c1000 = arith.constant 1000 : index + %alloc = memref.alloc() : memref<4xf16> + scf.forall (%arg1) in (4) { + %subview = memref.subview %arg0[%arg1, 0] [1, 1000] [1, 1] : memref<4x1000xf16> to memref<1000xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<1000xf16, strided<[1], offset: ?>> into memref<1x1000xf16, strided<[1000, 1], offset: ?>> + %alloca = memref.alloca() : memref<512xf16, #gpu.address_space> + scf.forall (%arg2) in (512) { + %0 = arith.muli %arg2, %c2 : index + %1 = arith.cmpi slt, %arg2, %c0 : index + %2 = arith.subi %c-1, %arg2 : index + %3 = arith.select %1, %2, %arg2 : index + %4 = arith.divsi %3, %c512 : index + %5 = arith.subi %c-1, %4 : index + %6 = arith.select %1, %5, %4 : index + %7 = arith.muli %6, %c-1024 : index + %8 = arith.addi %0, %7 : index + %9 = arith.cmpi slt, %8, %c1000 : index + %10 = arith.select %9, %8, %c1000 : index + %11 = arith.addi %8, %c2 : index + %12 = arith.cmpi slt, %11, %c1000 : index + %13 = arith.select %12, %11, %c1000 : index + %14 = arith.subi %13, %10 : index + %subview_8 = memref.subview %expand_shape[0, %10] [1, %14] [1, 1] : memref<1x1000xf16, strided<[1000, 1], offset: ?>> to memref> + %expand_shape_9 = memref.expand_shape %subview_8 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %15 = arith.cmpi ugt, %14, %c0 : index + %16 = scf.if %15 -> (f16) { + %21 = memref.load %expand_shape_9[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %21 : f16 + } else { + scf.yield %cst : f16 + } + %17 = arith.addf %16, %cst : f16 + %18 = arith.cmpi ugt, %14, %c1 : index + %19 = scf.if %18 -> (f16) { + %21 = memref.load %expand_shape_9[%c0, %c1] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %21 : f16 + } else { + scf.yield %cst : f16 + } + %20 = arith.addf %17, 
%19 : f16 + memref.store %20, %alloca[%arg2] : memref<512xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_0 = memref.alloca() : memref<256xf16, #gpu.address_space> + scf.forall (%arg2) in (256) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca[%0] : memref<512xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca[%3] : memref<512xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_0[%arg2] : memref<256xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_1 = memref.alloca() : memref<128xf16, #gpu.address_space> + scf.forall (%arg2) in (128) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_0[%0] : memref<256xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_0[%3] : memref<256xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_1[%arg2] : memref<128xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_2 = memref.alloca() : memref<64xf16, #gpu.address_space> + scf.forall (%arg2) in (64) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_1[%0] : memref<128xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_1[%3] : memref<128xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_2[%arg2] : memref<64xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_3 = memref.alloca() : memref<32xf16, #gpu.address_space> + scf.forall (%arg2) in (32) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_2[%0] : memref<64xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_2[%3] : memref<64xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_3[%arg2] : memref<32xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_4 = memref.alloca() : memref<16xf16, #gpu.address_space> + scf.forall (%arg2) in (16) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_3[%0] : memref<32xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_3[%3] : memref<32xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_4[%arg2] : memref<16xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_5 = memref.alloca() : memref<8xf16, #gpu.address_space> + scf.forall (%arg2) in (8) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_4[%0] : memref<16xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_4[%3] : memref<16xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_5[%arg2] : memref<8xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_6 = memref.alloca() : memref<4xf16, #gpu.address_space> + scf.forall (%arg2) in (4) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_5[%0] : memref<8xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_5[%3] : memref<8xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_6[%arg2] : memref<4xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_7 = memref.alloca() : memref<2xf16, #gpu.address_space> + scf.forall (%arg2) in (2) { + %0 = 
arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_6[%0] : memref<4xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_6[%3] : memref<4xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_7[%arg2] : memref<2xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + scf.forall (%arg2) in (1) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_7[%0] : memref<2xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_7[%3] : memref<2xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloc[%arg1] : memref<4xf16> + } {mapping = [#gpu.thread]} + } {mapping = [#gpu.block]} + return %alloc : memref<4xf16> + } + func.func private @Unknown26(%arg0: memref<4x64x112x112xf16>) -> (memref<4x64x112x112xf16>, memref<4x64x112x112xi1>) attributes {__byteir_elementwise_fusion__} { + %c112 = arith.constant 112 : index %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %c3211264 = arith.constant 3211264 : index %alloc = memref.alloc() : memref<4x64x112x112xf16> %alloc_0 = memref.alloc() : memref<4x64x112x112xi1> scf.for %arg1 = %c0 to %c3211264 step %c1 { %0 = arith.remsi %arg1, %c112 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c112 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c112 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c112 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c112 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c112 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x64x112x112xf16> - %31 = arith.maxnumf %30, %cst : f16 - %32 = arith.cmpf ogt, %31, %cst : f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<4x64x112x112xf16> - memref.store %32, %alloc_0[%29, %23, %13, %3] : memref<4x64x112x112xi1> + %1 = arith.divsi %arg1, %c112 : index + %2 = arith.remsi %1, %c112 : index + %3 = arith.divsi %1, %c112 : index + %4 = arith.remsi %3, %c64 : index + %5 = arith.divsi %3, %c64 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<4x64x112x112xf16> + %7 = arith.maximumf %6, %cst : f16 + %8 = arith.cmpf ogt, %7, %cst : f16 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<4x64x112x112xf16> + memref.store %8, %alloc_0[%5, %4, %2, %0] : memref<4x64x112x112xi1> } return %alloc, %alloc_0 : memref<4x64x112x112xf16>, memref<4x64x112x112xi1> } - func.func private @Unknown26(%arg0: memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = 
arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c1 = arith.constant 1 : index + func.func private @Unknown28(%arg0: memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index - %alloc = memref.alloc() : memref<4x64x56x56xf16> - %alloc_0 = memref.alloc() : memref<4x64x56x56xi1> - scf.for %arg1 = %c0 to %c802816 step %c1 { - %0 = arith.remsi %arg1, %c56 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c56 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c56 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c56 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c56 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c56 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x64x56x56xf16> - %31 = arith.maxnumf %30, %cst : f16 - %32 = arith.cmpf ogt, %31, %cst : f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<4x64x56x56xf16> - memref.store %32, %alloc_0[%29, %23, %13, %3] : memref<4x64x56x56xi1> - } - return %alloc, %alloc_0 : memref<4x64x56x56xf16>, memref<4x64x56x56xi1> - } - func.func private @Unknown28(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index %c1 = arith.constant 1 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %alloc = memref.alloc() : memref<4x64x56x56xf16> - %alloc_0 = memref.alloc() : memref<4x64x56x56xi1> - scf.for %arg2 = %c0 to %c802816 step %c1 { - %0 = arith.remsi %arg2, %c56 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c56 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c56 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c56 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c56 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c56 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 
= arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x64x56x56xf16> - %31 = memref.load %arg1[%29, %23, %13, %3] : memref<4x64x56x56xf16> - %32 = arith.addf %30, %31 : f16 - %33 = arith.maxnumf %32, %cst : f16 - %34 = arith.cmpf ogt, %33, %cst : f16 - memref.store %33, %alloc[%29, %23, %13, %3] : memref<4x64x56x56xf16> - memref.store %34, %alloc_0[%29, %23, %13, %3] : memref<4x64x56x56xi1> - } - return %alloc, %alloc_0 : memref<4x64x56x56xf16>, memref<4x64x56x56xi1> - } - func.func private @Unknown30(%arg0: memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 %c802816 = arith.constant 802816 : index - %c1 = arith.constant 1 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index %alloc = memref.alloc() : memref<4x64x56x56xf16> %alloc_0 = memref.alloc() : memref<4x64x56x56xi1> scf.for %arg1 = %c0 to %c802816 step %c1 { %0 = arith.remsi %arg1, %c56 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c56 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c56 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c56 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c56 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c56 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x64x56x56xf16> - %31 = arith.maxnumf %30, %cst : f16 - %32 = arith.cmpf ogt, %31, %cst : f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<4x64x56x56xf16> - memref.store %32, %alloc_0[%29, %23, %13, %3] : memref<4x64x56x56xi1> + %1 = arith.divsi %arg1, %c56 : index + %2 = arith.remsi %1, %c56 : index + %3 = arith.divsi %1, %c56 : index + %4 = arith.remsi %3, %c64 : index + %5 = arith.divsi %3, %c64 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<4x64x56x56xf16> + %7 = arith.maximumf %6, %cst : f16 + %8 = arith.cmpf ogt, %7, %cst : f16 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<4x64x56x56xf16> + memref.store %8, %alloc_0[%5, %4, %2, %0] : memref<4x64x56x56xi1> } return %alloc, %alloc_0 : memref<4x64x56x56xf16>, memref<4x64x56x56xi1> } - func.func private @Unknown32(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) attributes 
{__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c1 = arith.constant 1 : index + func.func private @Unknown30(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) attributes {__byteir_elementwise_fusion__} { %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %c802816 = arith.constant 802816 : index %alloc = memref.alloc() : memref<4x64x56x56xf16> %alloc_0 = memref.alloc() : memref<4x64x56x56xi1> scf.for %arg2 = %c0 to %c802816 step %c1 { %0 = arith.remsi %arg2, %c56 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c56 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c56 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c56 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c56 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c56 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x64x56x56xf16> - %31 = memref.load %arg1[%29, %23, %13, %3] : memref<4x64x56x56xf16> - %32 = arith.addf %30, %31 : f16 - %33 = arith.maxnumf %32, %cst : f16 - %34 = arith.cmpf ogt, %33, %cst : f16 - memref.store %33, %alloc[%29, %23, %13, %3] : memref<4x64x56x56xf16> - memref.store %34, %alloc_0[%29, %23, %13, %3] : memref<4x64x56x56xi1> + %1 = arith.divsi %arg2, %c56 : index + %2 = arith.remsi %1, %c56 : index + %3 = arith.divsi %1, %c56 : index + %4 = arith.remsi %3, %c64 : index + %5 = arith.divsi %3, %c64 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<4x64x56x56xf16> + %7 = memref.load %arg1[%5, %4, %2, %0] : memref<4x64x56x56xf16> + %8 = arith.addf %6, %7 : f16 + %9 = arith.maximumf %8, %cst : f16 + %10 = arith.cmpf ogt, %9, %cst : f16 + memref.store %9, %alloc[%5, %4, %2, %0] : memref<4x64x56x56xf16> + memref.store %10, %alloc_0[%5, %4, %2, %0] : memref<4x64x56x56xi1> } return %alloc, %alloc_0 : memref<4x64x56x56xf16>, memref<4x64x56x56xi1> } - func.func private @Unknown35(%arg0: memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c1 = arith.constant 1 : index + func.func private @Unknown37(%arg0: memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index + 
%c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %c401408 = arith.constant 401408 : index %alloc = memref.alloc() : memref<4x128x28x28xf16> %alloc_0 = memref.alloc() : memref<4x128x28x28xi1> scf.for %arg1 = %c0 to %c401408 step %c1 { %0 = arith.remsi %arg1, %c28 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c28 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c28 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c28 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c28 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c28 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c128 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c128 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c128 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x128x28x28xf16> - %31 = arith.maxnumf %30, %cst : f16 - %32 = arith.cmpf ogt, %31, %cst : f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<4x128x28x28xf16> - memref.store %32, %alloc_0[%29, %23, %13, %3] : memref<4x128x28x28xi1> + %1 = arith.divsi %arg1, %c28 : index + %2 = arith.remsi %1, %c28 : index + %3 = arith.divsi %1, %c28 : index + %4 = arith.remsi %3, %c128 : index + %5 = arith.divsi %3, %c128 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<4x128x28x28xf16> + %7 = arith.maximumf %6, %cst : f16 + %8 = arith.cmpf ogt, %7, %cst : f16 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<4x128x28x28xf16> + memref.store %8, %alloc_0[%5, %4, %2, %0] : memref<4x128x28x28xi1> } return %alloc, %alloc_0 : memref<4x128x28x28xf16>, memref<4x128x28x28xi1> } - func.func private @Unknown37(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c1 = arith.constant 1 : index + func.func private @Unknown39(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %c401408 = arith.constant 401408 : index %alloc = memref.alloc() : memref<4x128x28x28xf16> %alloc_0 = memref.alloc() : memref<4x128x28x28xi1> scf.for %arg2 = %c0 to %c401408 step %c1 { %0 = arith.remsi %arg2, %c28 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c28 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c28 : index - 
%8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c28 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c28 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c28 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c128 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c128 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c128 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x128x28x28xf16> - %31 = memref.load %arg1[%29, %23, %13, %3] : memref<4x128x28x28xf16> - %32 = arith.addf %30, %31 : f16 - %33 = arith.maxnumf %32, %cst : f16 - %34 = arith.cmpf ogt, %33, %cst : f16 - memref.store %33, %alloc[%29, %23, %13, %3] : memref<4x128x28x28xf16> - memref.store %34, %alloc_0[%29, %23, %13, %3] : memref<4x128x28x28xi1> + %1 = arith.divsi %arg2, %c28 : index + %2 = arith.remsi %1, %c28 : index + %3 = arith.divsi %1, %c28 : index + %4 = arith.remsi %3, %c128 : index + %5 = arith.divsi %3, %c128 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<4x128x28x28xf16> + %7 = memref.load %arg1[%5, %4, %2, %0] : memref<4x128x28x28xf16> + %8 = arith.addf %6, %7 : f16 + %9 = arith.maximumf %8, %cst : f16 + %10 = arith.cmpf ogt, %9, %cst : f16 + memref.store %9, %alloc[%5, %4, %2, %0] : memref<4x128x28x28xf16> + memref.store %10, %alloc_0[%5, %4, %2, %0] : memref<4x128x28x28xi1> } return %alloc, %alloc_0 : memref<4x128x28x28xf16>, memref<4x128x28x28xi1> } - func.func private @Unknown39(%arg0: memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index + func.func private @Unknown46(%arg0: memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { + %c14 = arith.constant 14 : index + %c256 = arith.constant 256 : index %c1 = arith.constant 1 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %alloc = memref.alloc() : memref<4x128x28x28xf16> - %alloc_0 = memref.alloc() : memref<4x128x28x28xi1> - scf.for %arg1 = %c0 to %c401408 step %c1 { - %0 = arith.remsi %arg1, %c28 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c28 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c28 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c28 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c28 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c28 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c128 : index - %21 = arith.cmpi slt, %20, 
%c0 : index - %22 = arith.addi %20, %c128 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c128 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x128x28x28xf16> - %31 = arith.maxnumf %30, %cst : f16 - %32 = arith.cmpf ogt, %31, %cst : f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<4x128x28x28xf16> - memref.store %32, %alloc_0[%29, %23, %13, %3] : memref<4x128x28x28xi1> - } - return %alloc, %alloc_0 : memref<4x128x28x28xf16>, memref<4x128x28x28xi1> - } - func.func private @Unknown41(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c1 = arith.constant 1 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %alloc = memref.alloc() : memref<4x128x28x28xf16> - %alloc_0 = memref.alloc() : memref<4x128x28x28xi1> - scf.for %arg2 = %c0 to %c401408 step %c1 { - %0 = arith.remsi %arg2, %c28 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c28 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c28 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c28 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c28 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c28 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c128 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c128 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c128 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x128x28x28xf16> - %31 = memref.load %arg1[%29, %23, %13, %3] : memref<4x128x28x28xf16> - %32 = arith.addf %30, %31 : f16 - %33 = arith.maxnumf %32, %cst : f16 - %34 = arith.cmpf ogt, %33, %cst : f16 - memref.store %33, %alloc[%29, %23, %13, %3] : memref<4x128x28x28xf16> - memref.store %34, %alloc_0[%29, %23, %13, %3] : memref<4x128x28x28xi1> - } - return %alloc, %alloc_0 : memref<4x128x28x28xf16>, memref<4x128x28x28xi1> - } - func.func private @Unknown44(%arg0: memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index %c200704 = arith.constant 200704 : index - %c1 = arith.constant 1 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x256x14x14xf16> %alloc_0 = memref.alloc() : memref<4x256x14x14xi1> scf.for %arg1 = %c0 to %c200704 step %c1 { %0 = arith.remsi 
%arg1, %c14 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c14 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c14 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c14 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c14 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c14 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c256 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c256 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c256 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x256x14x14xf16> - %31 = arith.maxnumf %30, %cst : f16 - %32 = arith.cmpf ogt, %31, %cst : f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<4x256x14x14xf16> - memref.store %32, %alloc_0[%29, %23, %13, %3] : memref<4x256x14x14xi1> + %1 = arith.divsi %arg1, %c14 : index + %2 = arith.remsi %1, %c14 : index + %3 = arith.divsi %1, %c14 : index + %4 = arith.remsi %3, %c256 : index + %5 = arith.divsi %3, %c256 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<4x256x14x14xf16> + %7 = arith.maximumf %6, %cst : f16 + %8 = arith.cmpf ogt, %7, %cst : f16 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<4x256x14x14xf16> + memref.store %8, %alloc_0[%5, %4, %2, %0] : memref<4x256x14x14xi1> } return %alloc, %alloc_0 : memref<4x256x14x14xf16>, memref<4x256x14x14xi1> } - func.func private @Unknown46(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c1 = arith.constant 1 : index + func.func private @Unknown48(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index - %alloc = memref.alloc() : memref<4x256x14x14xf16> - %alloc_0 = memref.alloc() : memref<4x256x14x14xi1> - scf.for %arg2 = %c0 to %c200704 step %c1 { - %0 = arith.remsi %arg2, %c14 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c14 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c14 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c14 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c14 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c14 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, 
%17 : index - %20 = arith.remsi %19, %c256 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c256 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c256 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x256x14x14xf16> - %31 = memref.load %arg1[%29, %23, %13, %3] : memref<4x256x14x14xf16> - %32 = arith.addf %30, %31 : f16 - %33 = arith.maxnumf %32, %cst : f16 - %34 = arith.cmpf ogt, %33, %cst : f16 - memref.store %33, %alloc[%29, %23, %13, %3] : memref<4x256x14x14xf16> - memref.store %34, %alloc_0[%29, %23, %13, %3] : memref<4x256x14x14xi1> - } - return %alloc, %alloc_0 : memref<4x256x14x14xf16>, memref<4x256x14x14xi1> - } - func.func private @Unknown48(%arg0: memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index %c1 = arith.constant 1 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %alloc = memref.alloc() : memref<4x256x14x14xf16> - %alloc_0 = memref.alloc() : memref<4x256x14x14xi1> - scf.for %arg1 = %c0 to %c200704 step %c1 { - %0 = arith.remsi %arg1, %c14 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c14 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c14 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c14 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c14 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c14 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c256 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c256 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c256 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x256x14x14xf16> - %31 = arith.maxnumf %30, %cst : f16 - %32 = arith.cmpf ogt, %31, %cst : f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<4x256x14x14xf16> - memref.store %32, %alloc_0[%29, %23, %13, %3] : memref<4x256x14x14xi1> - } - return %alloc, %alloc_0 : memref<4x256x14x14xf16>, memref<4x256x14x14xi1> - } - func.func private @Unknown50(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 %c200704 = arith.constant 200704 : index - %c1 = arith.constant 1 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index %alloc = memref.alloc() : 
memref<4x256x14x14xf16> %alloc_0 = memref.alloc() : memref<4x256x14x14xi1> scf.for %arg2 = %c0 to %c200704 step %c1 { %0 = arith.remsi %arg2, %c14 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c14 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c14 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c14 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c14 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c14 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c256 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c256 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c256 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x256x14x14xf16> - %31 = memref.load %arg1[%29, %23, %13, %3] : memref<4x256x14x14xf16> - %32 = arith.addf %30, %31 : f16 - %33 = arith.maxnumf %32, %cst : f16 - %34 = arith.cmpf ogt, %33, %cst : f16 - memref.store %33, %alloc[%29, %23, %13, %3] : memref<4x256x14x14xf16> - memref.store %34, %alloc_0[%29, %23, %13, %3] : memref<4x256x14x14xi1> + %1 = arith.divsi %arg2, %c14 : index + %2 = arith.remsi %1, %c14 : index + %3 = arith.divsi %1, %c14 : index + %4 = arith.remsi %3, %c256 : index + %5 = arith.divsi %3, %c256 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<4x256x14x14xf16> + %7 = memref.load %arg1[%5, %4, %2, %0] : memref<4x256x14x14xf16> + %8 = arith.addf %6, %7 : f16 + %9 = arith.maximumf %8, %cst : f16 + %10 = arith.cmpf ogt, %9, %cst : f16 + memref.store %9, %alloc[%5, %4, %2, %0] : memref<4x256x14x14xf16> + memref.store %10, %alloc_0[%5, %4, %2, %0] : memref<4x256x14x14xi1> } return %alloc, %alloc_0 : memref<4x256x14x14xf16>, memref<4x256x14x14xi1> } - func.func private @Unknown53(%arg0: memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c1 = arith.constant 1 : index + func.func private @Unknown55(%arg0: memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %c512 = arith.constant 512 : index - %alloc = memref.alloc() : memref<4x512x7x7xf16> - %alloc_0 = memref.alloc() : memref<4x512x7x7xi1> - scf.for %arg1 = %c0 to %c100352 step %c1 { - %0 = arith.remsi %arg1, %c7 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c7 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c7 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c7 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c7 : index - %13 = 
arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c7 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c512 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c512 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c512 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x512x7x7xf16> - %31 = arith.maxnumf %30, %cst : f16 - %32 = arith.cmpf ogt, %31, %cst : f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<4x512x7x7xf16> - memref.store %32, %alloc_0[%29, %23, %13, %3] : memref<4x512x7x7xi1> - } - return %alloc, %alloc_0 : memref<4x512x7x7xf16>, memref<4x512x7x7xi1> - } - func.func private @Unknown55(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index %c1 = arith.constant 1 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %alloc = memref.alloc() : memref<4x512x7x7xf16> - %alloc_0 = memref.alloc() : memref<4x512x7x7xi1> - scf.for %arg2 = %c0 to %c100352 step %c1 { - %0 = arith.remsi %arg2, %c7 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c7 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c7 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c7 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c7 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c7 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c512 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c512 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c512 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x512x7x7xf16> - %31 = memref.load %arg1[%29, %23, %13, %3] : memref<4x512x7x7xf16> - %32 = arith.addf %30, %31 : f16 - %33 = arith.maxnumf %32, %cst : f16 - %34 = arith.cmpf ogt, %33, %cst : f16 - memref.store %33, %alloc[%29, %23, %13, %3] : memref<4x512x7x7xf16> - memref.store %34, %alloc_0[%29, %23, %13, %3] : memref<4x512x7x7xi1> - } - return %alloc, %alloc_0 : memref<4x512x7x7xf16>, memref<4x512x7x7xi1> - } - func.func private @Unknown57(%arg0: memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 %c100352 = 
arith.constant 100352 : index - %c1 = arith.constant 1 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index %alloc = memref.alloc() : memref<4x512x7x7xf16> %alloc_0 = memref.alloc() : memref<4x512x7x7xi1> scf.for %arg1 = %c0 to %c100352 step %c1 { %0 = arith.remsi %arg1, %c7 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c7 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c7 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c7 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c7 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c7 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c512 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c512 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c512 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x512x7x7xf16> - %31 = arith.maxnumf %30, %cst : f16 - %32 = arith.cmpf ogt, %31, %cst : f16 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<4x512x7x7xf16> - memref.store %32, %alloc_0[%29, %23, %13, %3] : memref<4x512x7x7xi1> + %1 = arith.divsi %arg1, %c7 : index + %2 = arith.remsi %1, %c7 : index + %3 = arith.divsi %1, %c7 : index + %4 = arith.remsi %3, %c512 : index + %5 = arith.divsi %3, %c512 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<4x512x7x7xf16> + %7 = arith.maximumf %6, %cst : f16 + %8 = arith.cmpf ogt, %7, %cst : f16 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<4x512x7x7xf16> + memref.store %8, %alloc_0[%5, %4, %2, %0] : memref<4x512x7x7xi1> } return %alloc, %alloc_0 : memref<4x512x7x7xf16>, memref<4x512x7x7xi1> } - func.func private @Unknown59(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c1 = arith.constant 1 : index + func.func private @Unknown57(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) attributes {__byteir_elementwise_fusion__} { %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %c100352 = arith.constant 100352 : index %alloc = memref.alloc() : memref<4x512x7x7xf16> %alloc_0 = memref.alloc() : memref<4x512x7x7xi1> scf.for %arg2 = %c0 to %c100352 step %c1 { %0 = arith.remsi %arg2, %c7 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c7 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c7 : index - %8 = arith.subi %c-1, %7 : index - 
%9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c7 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c7 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c7 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c512 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c512 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c512 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x512x7x7xf16> - %31 = memref.load %arg1[%29, %23, %13, %3] : memref<4x512x7x7xf16> - %32 = arith.addf %30, %31 : f16 - %33 = arith.maxnumf %32, %cst : f16 - %34 = arith.cmpf ogt, %33, %cst : f16 - memref.store %33, %alloc[%29, %23, %13, %3] : memref<4x512x7x7xf16> - memref.store %34, %alloc_0[%29, %23, %13, %3] : memref<4x512x7x7xi1> + %1 = arith.divsi %arg2, %c7 : index + %2 = arith.remsi %1, %c7 : index + %3 = arith.divsi %1, %c7 : index + %4 = arith.remsi %3, %c512 : index + %5 = arith.divsi %3, %c512 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<4x512x7x7xf16> + %7 = memref.load %arg1[%5, %4, %2, %0] : memref<4x512x7x7xf16> + %8 = arith.addf %6, %7 : f16 + %9 = arith.maximumf %8, %cst : f16 + %10 = arith.cmpf ogt, %9, %cst : f16 + memref.store %9, %alloc[%5, %4, %2, %0] : memref<4x512x7x7xf16> + memref.store %10, %alloc_0[%5, %4, %2, %0] : memref<4x512x7x7xi1> } return %alloc, %alloc_0 : memref<4x512x7x7xf16>, memref<4x512x7x7xi1> } - func.func private @Unknown60(%arg0: memref<4x512xf16>) -> memref<4x512xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 2.040100e-02 : f16 + func.func private @Unknown62(%arg0: memref<4x512x7x7xf16>) -> memref<4x512xf16> attributes {__byteir_reduction_fusion__} { %c0 = arith.constant 0 : index - %c2048 = arith.constant 2048 : index - %c1 = arith.constant 1 : index + %cst = arith.constant 0.000000e+00 : f16 + %c64 = arith.constant 64 : index + %c49 = arith.constant 49 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %collapse_shape = memref.collapse_shape %arg0 [[0, 1], [2, 3]] : memref<4x512x7x7xf16> into memref<2048x49xf16> + %alloc = memref.alloc() : memref<2048xf16> + scf.forall (%arg1) in (2048) { + %subview = memref.subview %collapse_shape[%arg1, 0] [1, 49] [1, 1] : memref<2048x49xf16> to memref<49xf16, strided<[1], offset: ?>> + %expand_shape_0 = memref.expand_shape %subview [[0, 1]] : memref<49xf16, strided<[1], offset: ?>> into memref<1x49xf16, strided<[49, 1], offset: ?>> + %alloca = memref.alloca() : memref<64xf16, #gpu.address_space> + scf.forall (%arg2) in (64) { + %0 = arith.remsi %arg2, %c64 : index + %1 = arith.cmpi slt, %0, %c0 : index + %2 = arith.addi %0, %c64 : index + %3 = arith.select %1, %2, %0 : index + %4 = arith.cmpi slt, %3, %c49 : index + %5 = arith.select %4, %3, %c49 : index + %6 = arith.addi %3, %c1 : index + %7 = arith.cmpi slt, %6, %c49 : index + %8 = arith.select %7, %6, %c49 : index + %9 = arith.subi %8, %5 : index + %subview_6 = memref.subview %expand_shape_0[0, %5] [1, %9] [1, 1] : memref<1x49xf16, strided<[49, 1], offset: ?>> to memref> + %expand_shape_7 = memref.expand_shape %subview_6 
[[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %10 = arith.cmpi ugt, %9, %c0 : index + %11 = scf.if %10 -> (f16) { + %13 = memref.load %expand_shape_7[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %13 : f16 + } else { + scf.yield %cst : f16 + } + %12 = arith.addf %11, %cst : f16 + memref.store %12, %alloca[%arg2] : memref<64xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_1 = memref.alloca() : memref<32xf16, #gpu.address_space> + scf.forall (%arg2) in (32) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca[%0] : memref<64xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca[%3] : memref<64xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_1[%arg2] : memref<32xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_2 = memref.alloca() : memref<16xf16, #gpu.address_space> + scf.forall (%arg2) in (16) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_1[%0] : memref<32xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_1[%3] : memref<32xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_2[%arg2] : memref<16xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_3 = memref.alloca() : memref<8xf16, #gpu.address_space> + scf.forall (%arg2) in (8) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_2[%0] : memref<16xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_2[%3] : memref<16xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_3[%arg2] : memref<8xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_4 = memref.alloca() : memref<4xf16, #gpu.address_space> + scf.forall (%arg2) in (4) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_3[%0] : memref<8xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_3[%3] : memref<8xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_4[%arg2] : memref<4xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_5 = memref.alloca() : memref<2xf16, #gpu.address_space> + scf.forall (%arg2) in (2) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_4[%0] : memref<4xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_4[%3] : memref<4xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_5[%arg2] : memref<2xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + scf.forall (%arg2) in (1) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_5[%0] : memref<2xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_5[%3] : memref<2xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloc[%arg1] : memref<2048xf16> + } {mapping = [#gpu.thread]} + } {mapping = [#gpu.block]} + %expand_shape = memref.expand_shape %alloc [[0, 1]] : memref<2048xf16> into memref<4x512xf16> + return %expand_shape : memref<4x512xf16> + } + func.func private @Unknown63(%arg0: memref<4x512xf16>) -> memref<4x512xf16> attributes {__byteir_elementwise_fusion__} { %c512 = arith.constant 512 : index - %c-1 = arith.constant -1 : index + %c1 = 
arith.constant 1 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 2.040100e-02 : f16 + %c2048 = arith.constant 2048 : index %alloc = memref.alloc() : memref<4x512xf16> scf.for %arg1 = %c0 to %c2048 step %c1 { %0 = arith.remsi %arg1, %c512 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c512 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c512 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = memref.load %arg0[%9, %3] : memref<4x512xf16> - %11 = arith.mulf %10, %cst : f16 - memref.store %11, %alloc[%9, %3] : memref<4x512xf16> + %1 = arith.divsi %arg1, %c512 : index + %2 = memref.load %arg0[%1, %0] : memref<4x512xf16> + %3 = arith.mulf %2, %cst : f16 + memref.store %3, %alloc[%1, %0] : memref<4x512xf16> } return %alloc : memref<4x512xf16> } - func.func private @Unknown61(%arg0: memref<1000xf32>, %arg1: memref<4x1000xf16>) -> memref<4x1000xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown64(%arg0: memref<1000xf16>, %arg1: memref<4x1000xf16>) -> memref<4x1000xf16> attributes {__byteir_elementwise_fusion__} { + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %c4000 = arith.constant 4000 : index - %c1 = arith.constant 1 : index - %c1000 = arith.constant 1000 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<4x1000xf16> scf.for %arg2 = %c0 to %c4000 step %c1 { %0 = arith.remsi %arg2, %c1000 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c1000 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c1000 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = memref.load %arg1[%9, %3] : memref<4x1000xf16> - %11 = memref.load %arg0[%3] : memref<1000xf32> - %12 = arith.truncf %11 : f32 to f16 - %13 = arith.addf %10, %12 : f16 - memref.store %13, %alloc[%9, %3] : memref<4x1000xf16> + %1 = arith.divsi %arg2, %c1000 : index + %2 = memref.load %arg0[%0] : memref<1000xf16> + %3 = memref.load %arg1[%1, %0] : memref<4x1000xf16> + %4 = arith.addf %3, %2 : f16 + memref.store %4, %alloc[%1, %0] : memref<4x1000xf16> } return %alloc : memref<4x1000xf16> } - func.func private @Unknown62(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>) -> (memref<4x1000xf16>, memref<4x1000xf16>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown65(%arg0: memref<4x1000xf16>) -> memref<4xf16> attributes {__byteir_reduction_fusion__} { %c0 = arith.constant 0 : index - %c4000 = arith.constant 4000 : index %c1 = arith.constant 1 : index - %c1000 = arith.constant 1000 : index + %cst = arith.constant 0.000000e+00 : f16 + %c2 = arith.constant 2 : index + %c512 = arith.constant 512 : index %c-1 = arith.constant -1 : index + %c-1024 = arith.constant -1024 : index + %c1000 = arith.constant 1000 : index + %alloc = memref.alloc() : memref<4xf16> + scf.forall (%arg1) in (4) { + %subview = memref.subview %arg0[%arg1, 0] [1, 1000] [1, 1] : memref<4x1000xf16> to memref<1000xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<1000xf16, strided<[1], offset: ?>> into memref<1x1000xf16, strided<[1000, 1], offset: ?>> + %alloca = memref.alloca() : memref<512xf16, 
#gpu.address_space> + scf.forall (%arg2) in (512) { + %0 = arith.muli %arg2, %c2 : index + %1 = arith.cmpi slt, %arg2, %c0 : index + %2 = arith.subi %c-1, %arg2 : index + %3 = arith.select %1, %2, %arg2 : index + %4 = arith.divsi %3, %c512 : index + %5 = arith.subi %c-1, %4 : index + %6 = arith.select %1, %5, %4 : index + %7 = arith.muli %6, %c-1024 : index + %8 = arith.addi %0, %7 : index + %9 = arith.cmpi slt, %8, %c1000 : index + %10 = arith.select %9, %8, %c1000 : index + %11 = arith.addi %8, %c2 : index + %12 = arith.cmpi slt, %11, %c1000 : index + %13 = arith.select %12, %11, %c1000 : index + %14 = arith.subi %13, %10 : index + %subview_8 = memref.subview %expand_shape[0, %10] [1, %14] [1, 1] : memref<1x1000xf16, strided<[1000, 1], offset: ?>> to memref> + %expand_shape_9 = memref.expand_shape %subview_8 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %15 = arith.cmpi ugt, %14, %c0 : index + %16 = scf.if %15 -> (f16) { + %20 = memref.load %expand_shape_9[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %20 : f16 + } else { + scf.yield %cst : f16 + } + %17 = arith.cmpi ugt, %14, %c1 : index + %18 = scf.if %17 -> (f16) { + %20 = memref.load %expand_shape_9[%c0, %c1] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %20 : f16 + } else { + scf.yield %cst : f16 + } + %19 = arith.maximumf %16, %18 : f16 + memref.store %19, %alloca[%arg2] : memref<512xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_0 = memref.alloca() : memref<256xf16, #gpu.address_space> + scf.forall (%arg2) in (256) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca[%0] : memref<512xf16, #gpu.address_space> + %2 = arith.addi %0, %c1 : index + %3 = memref.load %alloca[%2] : memref<512xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloca_0[%arg2] : memref<256xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_1 = memref.alloca() : memref<128xf16, #gpu.address_space> + scf.forall (%arg2) in (128) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_0[%0] : memref<256xf16, #gpu.address_space> + %2 = arith.addi %0, %c1 : index + %3 = memref.load %alloca_0[%2] : memref<256xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloca_1[%arg2] : memref<128xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_2 = memref.alloca() : memref<64xf16, #gpu.address_space> + scf.forall (%arg2) in (64) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_1[%0] : memref<128xf16, #gpu.address_space> + %2 = arith.addi %0, %c1 : index + %3 = memref.load %alloca_1[%2] : memref<128xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloca_2[%arg2] : memref<64xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_3 = memref.alloca() : memref<32xf16, #gpu.address_space> + scf.forall (%arg2) in (32) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_2[%0] : memref<64xf16, #gpu.address_space> + %2 = arith.addi %0, %c1 : index + %3 = memref.load %alloca_2[%2] : memref<64xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloca_3[%arg2] : memref<32xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_4 = memref.alloca() : memref<16xf16, #gpu.address_space> + scf.forall (%arg2) in (16) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_3[%0] : memref<32xf16, #gpu.address_space> + %2 = arith.addi %0, %c1 : index + %3 = memref.load %alloca_3[%2] : 
memref<32xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloca_4[%arg2] : memref<16xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_5 = memref.alloca() : memref<8xf16, #gpu.address_space> + scf.forall (%arg2) in (8) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_4[%0] : memref<16xf16, #gpu.address_space> + %2 = arith.addi %0, %c1 : index + %3 = memref.load %alloca_4[%2] : memref<16xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloca_5[%arg2] : memref<8xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_6 = memref.alloca() : memref<4xf16, #gpu.address_space> + scf.forall (%arg2) in (4) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_5[%0] : memref<8xf16, #gpu.address_space> + %2 = arith.addi %0, %c1 : index + %3 = memref.load %alloca_5[%2] : memref<8xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloca_6[%arg2] : memref<4xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_7 = memref.alloca() : memref<2xf16, #gpu.address_space> + scf.forall (%arg2) in (2) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_6[%0] : memref<4xf16, #gpu.address_space> + %2 = arith.addi %0, %c1 : index + %3 = memref.load %alloca_6[%2] : memref<4xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloca_7[%arg2] : memref<2xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + scf.forall (%arg2) in (1) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_7[%0] : memref<2xf16, #gpu.address_space> + %2 = arith.addi %0, %c1 : index + %3 = memref.load %alloca_7[%2] : memref<2xf16, #gpu.address_space> + %4 = arith.maximumf %3, %1 : f16 + memref.store %4, %alloc[%arg1] : memref<4xf16> + } {mapping = [#gpu.thread]} + } {mapping = [#gpu.block]} + return %alloc : memref<4xf16> + } + func.func private @Unknown66(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>) -> memref<4x1000xf16> attributes {__byteir_elementwise_fusion__} { + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c4000 = arith.constant 4000 : index %alloc = memref.alloc() : memref<4x1000xf16> - %alloc_0 = memref.alloc() : memref<4x1000xf16> scf.for %arg2 = %c0 to %c4000 step %c1 { %0 = arith.remsi %arg2, %c1000 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c1000 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c1000 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = memref.load %arg1[%9, %3] : memref<4x1000xf16> - %11 = memref.load %arg0[%9] : memref<4xf16> - %12 = arith.subf %10, %11 : f16 - %13 = math.exp %12 : f16 - memref.store %12, %alloc[%9, %3] : memref<4x1000xf16> - memref.store %13, %alloc_0[%9, %3] : memref<4x1000xf16> + %1 = arith.divsi %arg2, %c1000 : index + %2 = memref.load %arg0[%1] : memref<4xf16> + %3 = memref.load %arg1[%1, %0] : memref<4x1000xf16> + %4 = arith.subf %3, %2 : f16 + memref.store %4, %alloc[%1, %0] : memref<4x1000xf16> } - return %alloc, %alloc_0 : memref<4x1000xf16>, memref<4x1000xf16> + return %alloc : memref<4x1000xf16> } - func.func private @Unknown63(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4xf16>, %arg3: memref<4x1000xf16>, %arg4: memref<4x1000xf32>) -> (memref<4x1000xf16>, memref<4x1000xf32>, 
memref<4x1000xf32>) attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown67(%arg0: memref<4x1000xf16>) -> memref<4xf16> attributes {__byteir_reduction_fusion__} { %c0 = arith.constant 0 : index - %c4000 = arith.constant 4000 : index %c1 = arith.constant 1 : index - %c1000 = arith.constant 1000 : index + %cst = arith.constant 0.000000e+00 : f16 + %c2 = arith.constant 2 : index + %c512 = arith.constant 512 : index %c-1 = arith.constant -1 : index + %c-1024 = arith.constant -1024 : index + %c1000 = arith.constant 1000 : index + %alloc = memref.alloc() : memref<4xf16> + scf.forall (%arg1) in (4) { + %subview = memref.subview %arg0[%arg1, 0] [1, 1000] [1, 1] : memref<4x1000xf16> to memref<1000xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<1000xf16, strided<[1], offset: ?>> into memref<1x1000xf16, strided<[1000, 1], offset: ?>> + %alloca = memref.alloca() : memref<512xf16, #gpu.address_space> + scf.forall (%arg2) in (512) { + %0 = arith.muli %arg2, %c2 : index + %1 = arith.cmpi slt, %arg2, %c0 : index + %2 = arith.subi %c-1, %arg2 : index + %3 = arith.select %1, %2, %arg2 : index + %4 = arith.divsi %3, %c512 : index + %5 = arith.subi %c-1, %4 : index + %6 = arith.select %1, %5, %4 : index + %7 = arith.muli %6, %c-1024 : index + %8 = arith.addi %0, %7 : index + %9 = arith.cmpi slt, %8, %c1000 : index + %10 = arith.select %9, %8, %c1000 : index + %11 = arith.addi %8, %c2 : index + %12 = arith.cmpi slt, %11, %c1000 : index + %13 = arith.select %12, %11, %c1000 : index + %14 = arith.subi %13, %10 : index + %subview_8 = memref.subview %expand_shape[0, %10] [1, %14] [1, 1] : memref<1x1000xf16, strided<[1000, 1], offset: ?>> to memref> + %expand_shape_9 = memref.expand_shape %subview_8 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %15 = arith.cmpi ugt, %14, %c0 : index + %16 = scf.if %15 -> (f16) { + %23 = memref.load %expand_shape_9[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %23 : f16 + } else { + scf.yield %cst : f16 + } + %17 = math.exp %16 : f16 + %18 = arith.addf %17, %cst : f16 + %19 = arith.cmpi ugt, %14, %c1 : index + %20 = scf.if %19 -> (f16) { + %23 = memref.load %expand_shape_9[%c0, %c1] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %23 : f16 + } else { + scf.yield %cst : f16 + } + %21 = math.exp %20 : f16 + %22 = arith.addf %18, %21 : f16 + memref.store %22, %alloca[%arg2] : memref<512xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_0 = memref.alloca() : memref<256xf16, #gpu.address_space> + scf.forall (%arg2) in (256) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca[%0] : memref<512xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca[%3] : memref<512xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_0[%arg2] : memref<256xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_1 = memref.alloca() : memref<128xf16, #gpu.address_space> + scf.forall (%arg2) in (128) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_0[%0] : memref<256xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_0[%3] : memref<256xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_1[%arg2] : memref<128xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_2 = memref.alloca() : memref<64xf16, #gpu.address_space> + scf.forall 
(%arg2) in (64) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_1[%0] : memref<128xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_1[%3] : memref<128xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_2[%arg2] : memref<64xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_3 = memref.alloca() : memref<32xf16, #gpu.address_space> + scf.forall (%arg2) in (32) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_2[%0] : memref<64xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_2[%3] : memref<64xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_3[%arg2] : memref<32xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_4 = memref.alloca() : memref<16xf16, #gpu.address_space> + scf.forall (%arg2) in (16) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_3[%0] : memref<32xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_3[%3] : memref<32xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_4[%arg2] : memref<16xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_5 = memref.alloca() : memref<8xf16, #gpu.address_space> + scf.forall (%arg2) in (8) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_4[%0] : memref<16xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_4[%3] : memref<16xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_5[%arg2] : memref<8xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_6 = memref.alloca() : memref<4xf16, #gpu.address_space> + scf.forall (%arg2) in (4) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_5[%0] : memref<8xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_5[%3] : memref<8xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_6[%arg2] : memref<4xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_7 = memref.alloca() : memref<2xf16, #gpu.address_space> + scf.forall (%arg2) in (2) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_6[%0] : memref<4xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_6[%3] : memref<4xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloca_7[%arg2] : memref<2xf16, #gpu.address_space> + } {mapping = [#gpu.thread]} + scf.forall (%arg2) in (1) { + %0 = arith.muli %arg2, %c2 : index + %1 = memref.load %alloca_7[%0] : memref<2xf16, #gpu.address_space> + %2 = arith.addf %1, %cst : f16 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_7[%3] : memref<2xf16, #gpu.address_space> + %5 = arith.addf %4, %2 : f16 + memref.store %5, %alloc[%arg1] : memref<4xf16> + } {mapping = [#gpu.thread]} + } {mapping = [#gpu.block]} + return %alloc : memref<4xf16> + } + func.func private @Unknown68(%arg0: memref<4xf16>) -> memref<4xf16> attributes {__byteir_elementwise_fusion__} { + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index + %alloc = memref.alloc() : memref<4xf16> + scf.for %arg1 = %c0 to %c4 step %c1 { + %0 = memref.load %arg0[%arg1] : 
memref<4xf16> + %1 = math.log %0 : f16 + memref.store %1, %alloc[%arg1] : memref<4xf16> + } + return %alloc : memref<4xf16> + } + func.func private @Unknown69(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4xf16>, %arg3: memref<4x1000xf16>) -> (memref<4x1000xf16>, memref<4x1000xf16>) attributes {__byteir_elementwise_fusion__} { + %c1000 = arith.constant 1000 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c4000 = arith.constant 4000 : index %alloc = memref.alloc() : memref<4x1000xf16> - %alloc_0 = memref.alloc() : memref<4x1000xf32> - %alloc_1 = memref.alloc() : memref<4x1000xf32> - scf.for %arg5 = %c0 to %c4000 step %c1 { - %0 = arith.remsi %arg5, %c1000 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c1000 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg5, %c0 : index - %5 = arith.subi %c-1, %arg5 : index - %6 = arith.select %4, %5, %arg5 : index - %7 = arith.divsi %6, %c1000 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = memref.load %arg3[%9, %3] : memref<4x1000xf16> - %11 = memref.load %arg1[%9, %3] : memref<4x1000xf16> - %12 = memref.load %arg0[%9] : memref<4xf16> - %13 = memref.load %arg2[%9] : memref<4xf16> - %14 = memref.load %arg4[%9, %3] : memref<4x1000xf32> - %15 = math.log %12 : f16 - %16 = arith.subf %11, %15 : f16 - %17 = math.exp %16 : f16 - %18 = arith.mulf %17, %13 : f16 - %19 = arith.subf %10, %18 : f16 - %20 = arith.extf %16 : f16 to f32 - %21 = arith.mulf %20, %14 : f32 - %22 = arith.extf %19 : f16 to f32 - memref.store %19, %alloc[%9, %3] : memref<4x1000xf16> - memref.store %21, %alloc_0[%9, %3] : memref<4x1000xf32> - memref.store %22, %alloc_1[%9, %3] : memref<4x1000xf32> + %alloc_0 = memref.alloc() : memref<4x1000xf16> + scf.for %arg4 = %c0 to %c4000 step %c1 { + %0 = arith.remsi %arg4, %c1000 : index + %1 = arith.divsi %arg4, %c1000 : index + %2 = memref.load %arg2[%1] : memref<4xf16> + %3 = memref.load %arg0[%1] : memref<4xf16> + %4 = memref.load %arg1[%1, %0] : memref<4x1000xf16> + %5 = memref.load %arg3[%1, %0] : memref<4x1000xf16> + %6 = arith.subf %4, %3 : f16 + %7 = math.exp %6 : f16 + %8 = arith.mulf %7, %2 : f16 + %9 = arith.subf %5, %8 : f16 + memref.store %6, %alloc[%1, %0] : memref<4x1000xf16> + memref.store %9, %alloc_0[%1, %0] : memref<4x1000xf16> } - return %alloc, %alloc_0, %alloc_1 : memref<4x1000xf16>, memref<4x1000xf32>, memref<4x1000xf32> + return %alloc, %alloc_0 : memref<4x1000xf16>, memref<4x1000xf16> } - func.func private @Unknown64(%arg0: memref<4x512xf16>, %arg1: memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown70(%arg0: memref<4x512xf16>, %arg1: memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { + %c7 = arith.constant 7 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %cst_0 = arith.constant 4.900000e+01 : f16 - %c0 = arith.constant 0 : index %c100352 = arith.constant 100352 : index - %c1 = arith.constant 1 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index %alloc = memref.alloc() : memref<4x512x7x7xf16> scf.for %arg2 = %c0 to %c100352 step %c1 { %0 = arith.remsi %arg2, %c7 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c7 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - 
%5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c7 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c7 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c7 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c7 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c512 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c512 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c512 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg1[%29, %23, %13, %3] : memref<4x512x7x7xi1> - %31 = memref.load %arg0[%29, %23] : memref<4x512xf16> - %32 = arith.divf %31, %cst_0 : f16 - %33 = arith.select %30, %32, %cst : f16 - memref.store %33, %alloc[%29, %23, %13, %3] : memref<4x512x7x7xf16> + %1 = arith.divsi %arg2, %c7 : index + %2 = arith.remsi %1, %c7 : index + %3 = arith.divsi %1, %c7 : index + %4 = arith.remsi %3, %c512 : index + %5 = arith.divsi %3, %c512 : index + %6 = memref.load %arg0[%5, %4] : memref<4x512xf16> + %7 = memref.load %arg1[%5, %4, %2, %0] : memref<4x512x7x7xi1> + %8 = arith.divf %6, %cst_0 : f16 + %9 = arith.select %7, %8, %cst : f16 + memref.store %9, %alloc[%5, %4, %2, %0] : memref<4x512x7x7xf16> } return %alloc : memref<4x512x7x7xf16> } - func.func private @Unknown68(%arg0: memref<4x512x7x7xi1>, %arg1: memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c1 = arith.constant 1 : index + func.func private @Unknown74(%arg0: memref<4x512x7x7xi1>, %arg1: memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %c100352 = arith.constant 100352 : index %alloc = memref.alloc() : memref<4x512x7x7xf16> scf.for %arg2 = %c0 to %c100352 step %c1 { %0 = arith.remsi %arg2, %c7 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c7 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c7 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c7 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c7 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c7 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c512 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c512 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : 
index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c512 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x512x7x7xi1> - %31 = memref.load %arg1[%29, %23, %13, %3] : memref<4x512x7x7xf16> - %32 = arith.select %30, %31, %cst : f16 - memref.store %32, %alloc[%29, %23, %13, %3] : memref<4x512x7x7xf16> + %1 = arith.divsi %arg2, %c7 : index + %2 = arith.remsi %1, %c7 : index + %3 = arith.divsi %1, %c7 : index + %4 = arith.remsi %3, %c512 : index + %5 = arith.divsi %3, %c512 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<4x512x7x7xi1> + %7 = memref.load %arg1[%5, %4, %2, %0] : memref<4x512x7x7xf16> + %8 = arith.select %6, %7, %cst : f16 + memref.store %8, %alloc[%5, %4, %2, %0] : memref<4x512x7x7xf16> } return %alloc : memref<4x512x7x7xf16> } - func.func private @Unknown72(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c1 = arith.constant 1 : index + func.func private @Unknown78(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %c100352 = arith.constant 100352 : index %alloc = memref.alloc() : memref<4x512x7x7xf16> scf.for %arg3 = %c0 to %c100352 step %c1 { %0 = arith.remsi %arg3, %c7 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c7 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg3, %c0 : index - %5 = arith.subi %c-1, %arg3 : index - %6 = arith.select %4, %5, %arg3 : index - %7 = arith.divsi %6, %c7 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c7 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c7 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c7 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c512 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c512 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c512 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg2[%29, %23, %13, %3] : memref<4x512x7x7xi1> - %31 = memref.load %arg0[%29, %23, %13, %3] : memref<4x512x7x7xf16> - %32 = memref.load %arg1[%29, %23, %13, %3] : memref<4x512x7x7xf16> - %33 = arith.addf %31, %32 : f16 - %34 = arith.select %30, %33, %cst : f16 - memref.store %34, %alloc[%29, %23, %13, %3] : memref<4x512x7x7xf16> + %1 = arith.divsi %arg3, %c7 : index + %2 = arith.remsi %1, %c7 : index + %3 = arith.divsi %1, %c7 : index + %4 = arith.remsi %3, %c512 : index + %5 = arith.divsi %3, %c512 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<4x512x7x7xf16> + %7 = memref.load %arg1[%5, %4, 
%2, %0] : memref<4x512x7x7xf16> + %8 = memref.load %arg2[%5, %4, %2, %0] : memref<4x512x7x7xi1> + %9 = arith.addf %6, %7 : f16 + %10 = arith.select %8, %9, %cst : f16 + memref.store %10, %alloc[%5, %4, %2, %0] : memref<4x512x7x7xf16> } return %alloc : memref<4x512x7x7xf16> } - func.func private @Unknown76(%arg0: memref<4x512x7x7xi1>, %arg1: memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index + func.func private @Unknown89(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { + %c14 = arith.constant 14 : index + %c256 = arith.constant 256 : index %c1 = arith.constant 1 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %alloc = memref.alloc() : memref<4x512x7x7xf16> - scf.for %arg2 = %c0 to %c100352 step %c1 { - %0 = arith.remsi %arg2, %c7 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c7 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c7 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c7 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c7 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c7 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c512 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c512 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c512 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x512x7x7xi1> - %31 = memref.load %arg1[%29, %23, %13, %3] : memref<4x512x7x7xf16> - %32 = arith.select %30, %31, %cst : f16 - memref.store %32, %alloc[%29, %23, %13, %3] : memref<4x512x7x7xf16> - } - return %alloc : memref<4x512x7x7xf16> - } - func.func private @Unknown83(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 %c200704 = arith.constant 200704 : index - %c1 = arith.constant 1 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x256x14x14xf16> scf.for %arg3 = %c0 to %c200704 step %c1 { %0 = arith.remsi %arg3, %c14 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c14 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg3, %c0 : index - %5 = arith.subi %c-1, %arg3 : index - %6 = arith.select %4, %5, %arg3 : index - %7 = arith.divsi %6, %c14 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c14 : index - %11 = arith.cmpi 
slt, %10, %c0 : index - %12 = arith.addi %10, %c14 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c14 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c256 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c256 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c256 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg2[%29, %23, %13, %3] : memref<4x256x14x14xi1> - %31 = memref.load %arg0[%29, %23, %13, %3] : memref<4x256x14x14xf16> - %32 = memref.load %arg1[%29, %23, %13, %3] : memref<4x256x14x14xf16> - %33 = arith.addf %31, %32 : f16 - %34 = arith.select %30, %33, %cst : f16 - memref.store %34, %alloc[%29, %23, %13, %3] : memref<4x256x14x14xf16> + %1 = arith.divsi %arg3, %c14 : index + %2 = arith.remsi %1, %c14 : index + %3 = arith.divsi %1, %c14 : index + %4 = arith.remsi %3, %c256 : index + %5 = arith.divsi %3, %c256 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<4x256x14x14xf16> + %7 = memref.load %arg1[%5, %4, %2, %0] : memref<4x256x14x14xf16> + %8 = memref.load %arg2[%5, %4, %2, %0] : memref<4x256x14x14xi1> + %9 = arith.addf %6, %7 : f16 + %10 = arith.select %8, %9, %cst : f16 + memref.store %10, %alloc[%5, %4, %2, %0] : memref<4x256x14x14xf16> } return %alloc : memref<4x256x14x14xf16> } - func.func private @Unknown87(%arg0: memref<4x256x14x14xi1>, %arg1: memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c1 = arith.constant 1 : index + func.func private @Unknown93(%arg0: memref<4x256x14x14xi1>, %arg1: memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index - %alloc = memref.alloc() : memref<4x256x14x14xf16> - scf.for %arg2 = %c0 to %c200704 step %c1 { - %0 = arith.remsi %arg2, %c14 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c14 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c14 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c14 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c14 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c14 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c256 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c256 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c256 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load 
%arg0[%29, %23, %13, %3] : memref<4x256x14x14xi1> - %31 = memref.load %arg1[%29, %23, %13, %3] : memref<4x256x14x14xf16> - %32 = arith.select %30, %31, %cst : f16 - memref.store %32, %alloc[%29, %23, %13, %3] : memref<4x256x14x14xf16> - } - return %alloc : memref<4x256x14x14xf16> - } - func.func private @Unknown91(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index %c1 = arith.constant 1 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %alloc = memref.alloc() : memref<4x256x14x14xf16> - scf.for %arg3 = %c0 to %c200704 step %c1 { - %0 = arith.remsi %arg3, %c14 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c14 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg3, %c0 : index - %5 = arith.subi %c-1, %arg3 : index - %6 = arith.select %4, %5, %arg3 : index - %7 = arith.divsi %6, %c14 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c14 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c14 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c14 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c256 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c256 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c256 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg2[%29, %23, %13, %3] : memref<4x256x14x14xi1> - %31 = memref.load %arg0[%29, %23, %13, %3] : memref<4x256x14x14xf16> - %32 = memref.load %arg1[%29, %23, %13, %3] : memref<4x256x14x14xf16> - %33 = arith.addf %31, %32 : f16 - %34 = arith.select %30, %33, %cst : f16 - memref.store %34, %alloc[%29, %23, %13, %3] : memref<4x256x14x14xf16> - } - return %alloc : memref<4x256x14x14xf16> - } - func.func private @Unknown95(%arg0: memref<4x256x14x14xi1>, %arg1: memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 %c200704 = arith.constant 200704 : index - %c1 = arith.constant 1 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x256x14x14xf16> scf.for %arg2 = %c0 to %c200704 step %c1 { %0 = arith.remsi %arg2, %c14 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c14 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c14 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c14 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c14 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 
: index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c14 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c256 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c256 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c256 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x256x14x14xi1> - %31 = memref.load %arg1[%29, %23, %13, %3] : memref<4x256x14x14xf16> - %32 = arith.select %30, %31, %cst : f16 - memref.store %32, %alloc[%29, %23, %13, %3] : memref<4x256x14x14xf16> + %1 = arith.divsi %arg2, %c14 : index + %2 = arith.remsi %1, %c14 : index + %3 = arith.divsi %1, %c14 : index + %4 = arith.remsi %3, %c256 : index + %5 = arith.divsi %3, %c256 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<4x256x14x14xi1> + %7 = memref.load %arg1[%5, %4, %2, %0] : memref<4x256x14x14xf16> + %8 = arith.select %6, %7, %cst : f16 + memref.store %8, %alloc[%5, %4, %2, %0] : memref<4x256x14x14xf16> } return %alloc : memref<4x256x14x14xf16> } - func.func private @Unknown102(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c1 = arith.constant 1 : index + func.func private @Unknown108(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index - %alloc = memref.alloc() : memref<4x128x28x28xf16> - scf.for %arg3 = %c0 to %c401408 step %c1 { - %0 = arith.remsi %arg3, %c28 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c28 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg3, %c0 : index - %5 = arith.subi %c-1, %arg3 : index - %6 = arith.select %4, %5, %arg3 : index - %7 = arith.divsi %6, %c28 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c28 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c28 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c28 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c128 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c128 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c128 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg2[%29, %23, %13, %3] : memref<4x128x28x28xi1> - %31 = memref.load %arg0[%29, %23, %13, %3] : memref<4x128x28x28xf16> - %32 = memref.load %arg1[%29, %23, %13, %3] : memref<4x128x28x28xf16> - %33 = arith.addf %31, %32 : f16 - %34 = arith.select %30, %33, %cst : f16 - 
memref.store %34, %alloc[%29, %23, %13, %3] : memref<4x128x28x28xf16> - } - return %alloc : memref<4x128x28x28xf16> - } - func.func private @Unknown106(%arg0: memref<4x128x28x28xi1>, %arg1: memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index %c1 = arith.constant 1 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %alloc = memref.alloc() : memref<4x128x28x28xf16> - scf.for %arg2 = %c0 to %c401408 step %c1 { - %0 = arith.remsi %arg2, %c28 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c28 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c28 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c28 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c28 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c28 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c128 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c128 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c128 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x128x28x28xi1> - %31 = memref.load %arg1[%29, %23, %13, %3] : memref<4x128x28x28xf16> - %32 = arith.select %30, %31, %cst : f16 - memref.store %32, %alloc[%29, %23, %13, %3] : memref<4x128x28x28xf16> - } - return %alloc : memref<4x128x28x28xf16> - } - func.func private @Unknown110(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 %c401408 = arith.constant 401408 : index - %c1 = arith.constant 1 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index %alloc = memref.alloc() : memref<4x128x28x28xf16> scf.for %arg3 = %c0 to %c401408 step %c1 { %0 = arith.remsi %arg3, %c28 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c28 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg3, %c0 : index - %5 = arith.subi %c-1, %arg3 : index - %6 = arith.select %4, %5, %arg3 : index - %7 = arith.divsi %6, %c28 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c28 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c28 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c28 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c128 : index - %21 = arith.cmpi slt, 
%20, %c0 : index - %22 = arith.addi %20, %c128 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c128 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg2[%29, %23, %13, %3] : memref<4x128x28x28xi1> - %31 = memref.load %arg0[%29, %23, %13, %3] : memref<4x128x28x28xf16> - %32 = memref.load %arg1[%29, %23, %13, %3] : memref<4x128x28x28xf16> - %33 = arith.addf %31, %32 : f16 - %34 = arith.select %30, %33, %cst : f16 - memref.store %34, %alloc[%29, %23, %13, %3] : memref<4x128x28x28xf16> + %1 = arith.divsi %arg3, %c28 : index + %2 = arith.remsi %1, %c28 : index + %3 = arith.divsi %1, %c28 : index + %4 = arith.remsi %3, %c128 : index + %5 = arith.divsi %3, %c128 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<4x128x28x28xf16> + %7 = memref.load %arg1[%5, %4, %2, %0] : memref<4x128x28x28xf16> + %8 = memref.load %arg2[%5, %4, %2, %0] : memref<4x128x28x28xi1> + %9 = arith.addf %6, %7 : f16 + %10 = arith.select %8, %9, %cst : f16 + memref.store %10, %alloc[%5, %4, %2, %0] : memref<4x128x28x28xf16> } return %alloc : memref<4x128x28x28xf16> } - func.func private @Unknown114(%arg0: memref<4x128x28x28xi1>, %arg1: memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c1 = arith.constant 1 : index + func.func private @Unknown112(%arg0: memref<4x128x28x28xi1>, %arg1: memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> attributes {__byteir_elementwise_fusion__} { %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %c401408 = arith.constant 401408 : index %alloc = memref.alloc() : memref<4x128x28x28xf16> scf.for %arg2 = %c0 to %c401408 step %c1 { %0 = arith.remsi %arg2, %c28 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c28 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c28 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c28 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c28 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c28 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c128 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c128 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c128 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x128x28x28xi1> - %31 = memref.load %arg1[%29, %23, %13, %3] : memref<4x128x28x28xf16> - %32 = arith.select %30, %31, %cst : f16 - memref.store %32, %alloc[%29, %23, %13, %3] : memref<4x128x28x28xf16> + %1 = arith.divsi 
%arg2, %c28 : index + %2 = arith.remsi %1, %c28 : index + %3 = arith.divsi %1, %c28 : index + %4 = arith.remsi %3, %c128 : index + %5 = arith.divsi %3, %c128 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<4x128x28x28xi1> + %7 = memref.load %arg1[%5, %4, %2, %0] : memref<4x128x28x28xf16> + %8 = arith.select %6, %7, %cst : f16 + memref.store %8, %alloc[%5, %4, %2, %0] : memref<4x128x28x28xf16> } return %alloc : memref<4x128x28x28xf16> } - func.func private @Unknown121(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c1 = arith.constant 1 : index + func.func private @Unknown127(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index - %alloc = memref.alloc() : memref<4x64x56x56xf16> - scf.for %arg3 = %c0 to %c802816 step %c1 { - %0 = arith.remsi %arg3, %c56 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c56 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg3, %c0 : index - %5 = arith.subi %c-1, %arg3 : index - %6 = arith.select %4, %5, %arg3 : index - %7 = arith.divsi %6, %c56 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c56 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c56 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c56 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg2[%29, %23, %13, %3] : memref<4x64x56x56xi1> - %31 = memref.load %arg0[%29, %23, %13, %3] : memref<4x64x56x56xf16> - %32 = memref.load %arg1[%29, %23, %13, %3] : memref<4x64x56x56xf16> - %33 = arith.addf %31, %32 : f16 - %34 = arith.select %30, %33, %cst : f16 - memref.store %34, %alloc[%29, %23, %13, %3] : memref<4x64x56x56xf16> - } - return %alloc : memref<4x64x56x56xf16> - } - func.func private @Unknown125(%arg0: memref<4x64x56x56xi1>, %arg1: memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index %c1 = arith.constant 1 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %alloc = memref.alloc() : memref<4x64x56x56xf16> - scf.for %arg2 = %c0 to %c802816 step %c1 { - %0 = arith.remsi %arg2, %c56 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c56 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : 
index - %7 = arith.divsi %6, %c56 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c56 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c56 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c56 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x64x56x56xi1> - %31 = memref.load %arg1[%29, %23, %13, %3] : memref<4x64x56x56xf16> - %32 = arith.select %30, %31, %cst : f16 - memref.store %32, %alloc[%29, %23, %13, %3] : memref<4x64x56x56xf16> - } - return %alloc : memref<4x64x56x56xf16> - } - func.func private @Unknown129(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 %c802816 = arith.constant 802816 : index - %c1 = arith.constant 1 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index %alloc = memref.alloc() : memref<4x64x56x56xf16> scf.for %arg3 = %c0 to %c802816 step %c1 { %0 = arith.remsi %arg3, %c56 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c56 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg3, %c0 : index - %5 = arith.subi %c-1, %arg3 : index - %6 = arith.select %4, %5, %arg3 : index - %7 = arith.divsi %6, %c56 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c56 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c56 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c56 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg2[%29, %23, %13, %3] : memref<4x64x56x56xi1> - %31 = memref.load %arg0[%29, %23, %13, %3] : memref<4x64x56x56xf16> - %32 = memref.load %arg1[%29, %23, %13, %3] : memref<4x64x56x56xf16> - %33 = arith.addf %31, %32 : f16 - %34 = arith.select %30, %33, %cst : f16 - memref.store %34, %alloc[%29, %23, %13, %3] : memref<4x64x56x56xf16> + %1 = arith.divsi %arg3, %c56 : index + %2 = arith.remsi %1, %c56 : index + %3 = arith.divsi %1, %c56 : index + %4 = arith.remsi %3, %c64 : index + %5 = arith.divsi %3, %c64 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : 
memref<4x64x56x56xf16> + %7 = memref.load %arg1[%5, %4, %2, %0] : memref<4x64x56x56xf16> + %8 = memref.load %arg2[%5, %4, %2, %0] : memref<4x64x56x56xi1> + %9 = arith.addf %6, %7 : f16 + %10 = arith.select %8, %9, %cst : f16 + memref.store %10, %alloc[%5, %4, %2, %0] : memref<4x64x56x56xf16> } return %alloc : memref<4x64x56x56xf16> } - func.func private @Unknown133(%arg0: memref<4x64x56x56xi1>, %arg1: memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c1 = arith.constant 1 : index + func.func private @Unknown131(%arg0: memref<4x64x56x56xi1>, %arg1: memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %c802816 = arith.constant 802816 : index %alloc = memref.alloc() : memref<4x64x56x56xf16> scf.for %arg2 = %c0 to %c802816 step %c1 { %0 = arith.remsi %arg2, %c56 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c56 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c56 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c56 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c56 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c56 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x64x56x56xi1> - %31 = memref.load %arg1[%29, %23, %13, %3] : memref<4x64x56x56xf16> - %32 = arith.select %30, %31, %cst : f16 - memref.store %32, %alloc[%29, %23, %13, %3] : memref<4x64x56x56xf16> + %1 = arith.divsi %arg2, %c56 : index + %2 = arith.remsi %1, %c56 : index + %3 = arith.divsi %1, %c56 : index + %4 = arith.remsi %3, %c64 : index + %5 = arith.divsi %3, %c64 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<4x64x56x56xi1> + %7 = memref.load %arg1[%5, %4, %2, %0] : memref<4x64x56x56xf16> + %8 = arith.select %6, %7, %cst : f16 + memref.store %8, %alloc[%5, %4, %2, %0] : memref<4x64x56x56xf16> } return %alloc : memref<4x64x56x56xf16> } - func.func private @Unknown137(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c1 = arith.constant 1 : index + func.func private @Unknown143(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> attributes {__byteir_elementwise_fusion__} { %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : 
index %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c802816 = arith.constant 802816 : index %alloc = memref.alloc() : memref<4x64x56x56xf16> scf.for %arg2 = %c0 to %c802816 step %c1 { %0 = arith.remsi %arg2, %c56 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c56 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c56 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c56 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c56 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c56 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x64x56x56xf16> - %31 = memref.load %arg1[%29, %23, %13, %3] : memref<4x64x56x56xf16> - %32 = arith.addf %30, %31 : f16 - memref.store %32, %alloc[%29, %23, %13, %3] : memref<4x64x56x56xf16> + %1 = arith.divsi %arg2, %c56 : index + %2 = arith.remsi %1, %c56 : index + %3 = arith.divsi %1, %c56 : index + %4 = arith.remsi %3, %c64 : index + %5 = arith.divsi %3, %c64 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<4x64x56x56xf16> + %7 = memref.load %arg1[%5, %4, %2, %0] : memref<4x64x56x56xf16> + %8 = arith.addf %6, %7 : f16 + memref.store %8, %alloc[%5, %4, %2, %0] : memref<4x64x56x56xf16> } return %alloc : memref<4x64x56x56xf16> } - func.func private @Unknown138(%arg0: memref<4x64x112x112xi1>, %arg1: memref<4x64x112x112xf16>) -> memref<4x64x112x112xf16> attributes {__byteir_elementwise_fusion__} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c3211264 = arith.constant 3211264 : index - %c1 = arith.constant 1 : index + func.func private @Unknown144(%arg0: memref<4x64x112x112xi1>, %arg1: memref<4x64x112x112xf16>) -> memref<4x64x112x112xf16> attributes {__byteir_elementwise_fusion__} { %c112 = arith.constant 112 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %c3211264 = arith.constant 3211264 : index %alloc = memref.alloc() : memref<4x64x112x112xf16> scf.for %arg2 = %c0 to %c3211264 step %c1 { %0 = arith.remsi %arg2, %c112 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c112 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg2, %c0 : index - %5 = arith.subi %c-1, %arg2 : index - %6 = arith.select %4, %5, %arg2 : index - %7 = arith.divsi %6, %c112 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c112 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c112 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = 
-      %16 = arith.select %14, %15, %9 : index
-      %17 = arith.divsi %16, %c112 : index
-      %18 = arith.subi %c-1, %17 : index
-      %19 = arith.select %14, %18, %17 : index
-      %20 = arith.remsi %19, %c64 : index
-      %21 = arith.cmpi slt, %20, %c0 : index
-      %22 = arith.addi %20, %c64 : index
-      %23 = arith.select %21, %22, %20 : index
-      %24 = arith.cmpi slt, %19, %c0 : index
-      %25 = arith.subi %c-1, %19 : index
-      %26 = arith.select %24, %25, %19 : index
-      %27 = arith.divsi %26, %c64 : index
-      %28 = arith.subi %c-1, %27 : index
-      %29 = arith.select %24, %28, %27 : index
-      %30 = memref.load %arg0[%29, %23, %13, %3] : memref<4x64x112x112xi1>
-      %31 = memref.load %arg1[%29, %23, %13, %3] : memref<4x64x112x112xf16>
-      %32 = arith.select %30, %31, %cst : f16
-      memref.store %32, %alloc[%29, %23, %13, %3] : memref<4x64x112x112xf16>
+      %1 = arith.divsi %arg2, %c112 : index
+      %2 = arith.remsi %1, %c112 : index
+      %3 = arith.divsi %1, %c112 : index
+      %4 = arith.remsi %3, %c64 : index
+      %5 = arith.divsi %3, %c64 : index
+      %6 = memref.load %arg0[%5, %4, %2, %0] : memref<4x64x112x112xi1>
+      %7 = memref.load %arg1[%5, %4, %2, %0] : memref<4x64x112x112xf16>
+      %8 = arith.select %6, %7, %cst : f16
+      memref.store %8, %alloc[%5, %4, %2, %0] : memref<4x64x112x112xf16>
     }
     return %alloc : memref<4x64x112x112xf16>
   }
-  func.func private @Unknown141(%arg0: memref<f32>) -> memref<f32> attributes {__byteir_elementwise_fusion__} {
+  func.func private @Unknown147(%arg0: memref<4x1000xf16>, %arg1: memref<4x1000xf32>) -> memref<f32> attributes {__byteir_reduction_fusion__} {
+    %c0 = arith.constant 0 : index
+    %cst = arith.constant 0.000000e+00 : f16
+    %cst_0 = arith.constant 0.000000e+00 : f32
+    %c128 = arith.constant 128 : index
+    %c125 = arith.constant 125 : index
+    %c1 = arith.constant 1 : index
+    %c2 = arith.constant 2 : index
+    %c32 = arith.constant 32 : index
+    %alloc = memref.alloc() : memref<f32>
+    %collapse_shape = memref.collapse_shape %arg0 [[0, 1]] : memref<4x1000xf16> into memref<4000xf16>
+    %collapse_shape_1 = memref.collapse_shape %arg1 [[0, 1]] : memref<4x1000xf32> into memref<4000xf32>
+    %expand_shape = memref.expand_shape %collapse_shape [[0, 1]] : memref<4000xf16> into memref<32x125xf16>
+    %expand_shape_2 = memref.expand_shape %collapse_shape_1 [[0, 1]] : memref<4000xf32> into memref<32x125xf32>
+    %alloc_3 = memref.alloc() : memref<32xf32>
+    scf.forall (%arg2) in (32) {
+      %subview = memref.subview %expand_shape[%arg2, 0] [1, 125] [1, 1] : memref<32x125xf16> to memref<125xf16, strided<[1], offset: ?>>
+      %expand_shape_4 = memref.expand_shape %subview [[0, 1]] : memref<125xf16, strided<[1], offset: ?>> into memref<1x125xf16, strided<[125, 1], offset: ?>>
+      %subview_5 = memref.subview %expand_shape_2[%arg2, 0] [1, 125] [1, 1] : memref<32x125xf32> to memref<125xf32, strided<[1], offset: ?>>
+      %expand_shape_6 = memref.expand_shape %subview_5 [[0, 1]] : memref<125xf32, strided<[1], offset: ?>> into memref<1x125xf32, strided<[125, 1], offset: ?>>
+      %alloca = memref.alloca() : memref<128xf32, #gpu.address_space<workgroup>>
+      scf.forall (%arg3) in (128) {
+        %0 = arith.remsi %arg3, %c128 : index
+        %1 = arith.cmpi slt, %0, %c0 : index
+        %2 = arith.addi %0, %c128 : index
+        %3 = arith.select %1, %2, %0 : index
+        %4 = arith.cmpi slt, %3, %c125 : index
+        %5 = arith.select %4, %3, %c125 : index
+        %6 = arith.addi %3, %c1 : index
+        %7 = arith.cmpi slt, %6, %c125 : index
+        %8 = arith.select %7, %6, %c125 : index
+        %9 = arith.subi %8, %5 : index
+        %subview_13 = memref.subview %expand_shape_4[0, %5] [1, %9] [1, 1] : memref<1x125xf16, strided<[125,
1], offset: ?>> to memref> + %expand_shape_14 = memref.expand_shape %subview_13 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %subview_15 = memref.subview %expand_shape_6[0, %5] [1, %9] [1, 1] : memref<1x125xf32, strided<[125, 1], offset: ?>> to memref> + %expand_shape_16 = memref.expand_shape %subview_15 [[0, 1]] : memref> into memref<1x?xf32, strided<[?, 1], offset: ?>> + %10 = arith.cmpi ugt, %9, %c0 : index + %11:2 = scf.if %10 -> (f16, f32) { + %15 = memref.load %expand_shape_14[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + %16 = memref.load %expand_shape_16[%c0, %c0] : memref<1x?xf32, strided<[?, 1], offset: ?>> + scf.yield %15, %16 : f16, f32 + } else { + scf.yield %cst, %cst_0 : f16, f32 + } + %12 = arith.extf %11#0 : f16 to f32 + %13 = arith.mulf %12, %11#1 : f32 + %14 = arith.addf %13, %cst_0 : f32 + memref.store %14, %alloca[%arg3] : memref<128xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_7 = memref.alloca() : memref<64xf32, #gpu.address_space> + scf.forall (%arg3) in (64) { + %0 = arith.muli %arg3, %c2 : index + %1 = memref.load %alloca[%0] : memref<128xf32, #gpu.address_space> + %2 = arith.addf %1, %cst_0 : f32 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca[%3] : memref<128xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_7[%arg3] : memref<64xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_8 = memref.alloca() : memref<32xf32, #gpu.address_space> + scf.forall (%arg3) in (32) { + %0 = arith.muli %arg3, %c2 : index + %1 = memref.load %alloca_7[%0] : memref<64xf32, #gpu.address_space> + %2 = arith.addf %1, %cst_0 : f32 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_7[%3] : memref<64xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_8[%arg3] : memref<32xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_9 = memref.alloca() : memref<16xf32, #gpu.address_space> + scf.forall (%arg3) in (16) { + %0 = arith.muli %arg3, %c2 : index + %1 = memref.load %alloca_8[%0] : memref<32xf32, #gpu.address_space> + %2 = arith.addf %1, %cst_0 : f32 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_8[%3] : memref<32xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_9[%arg3] : memref<16xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_10 = memref.alloca() : memref<8xf32, #gpu.address_space> + scf.forall (%arg3) in (8) { + %0 = arith.muli %arg3, %c2 : index + %1 = memref.load %alloca_9[%0] : memref<16xf32, #gpu.address_space> + %2 = arith.addf %1, %cst_0 : f32 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_9[%3] : memref<16xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_10[%arg3] : memref<8xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_11 = memref.alloca() : memref<4xf32, #gpu.address_space> + scf.forall (%arg3) in (4) { + %0 = arith.muli %arg3, %c2 : index + %1 = memref.load %alloca_10[%0] : memref<8xf32, #gpu.address_space> + %2 = arith.addf %1, %cst_0 : f32 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_10[%3] : memref<8xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_11[%arg3] : memref<4xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_12 = memref.alloca() : memref<2xf32, #gpu.address_space> + scf.forall (%arg3) in (2) { + %0 = arith.muli %arg3, %c2 : index + %1 = memref.load %alloca_11[%0] : memref<4xf32, #gpu.address_space> + %2 
= arith.addf %1, %cst_0 : f32 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_11[%3] : memref<4xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_12[%arg3] : memref<2xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + scf.forall (%arg3) in (1) { + %0 = arith.muli %arg3, %c2 : index + %1 = memref.load %alloca_12[%0] : memref<2xf32, #gpu.address_space> + %2 = arith.addf %1, %cst_0 : f32 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_12[%3] : memref<2xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloc_3[%arg2] : memref<32xf32> + } {mapping = [#gpu.thread]} + } {mapping = [#gpu.block]} + scf.forall (%arg2) in (1) { + %alloca = memref.alloca() : memref<32xf32, #gpu.address_space> + scf.forall (%arg3) in (32) { + %0 = arith.muli %arg2, %c32 : index + %1 = arith.addi %0, %arg3 : index + %2 = memref.load %alloc_3[%1] : memref<32xf32> + %3 = arith.addf %2, %cst_0 : f32 + memref.store %3, %alloca[%arg3] : memref<32xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_4 = memref.alloca() : memref<16xf32, #gpu.address_space> + scf.forall (%arg3) in (16) { + %0 = arith.muli %arg3, %c2 : index + %1 = memref.load %alloca[%0] : memref<32xf32, #gpu.address_space> + %2 = arith.addf %1, %cst_0 : f32 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca[%3] : memref<32xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_4[%arg3] : memref<16xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_5 = memref.alloca() : memref<8xf32, #gpu.address_space> + scf.forall (%arg3) in (8) { + %0 = arith.muli %arg3, %c2 : index + %1 = memref.load %alloca_4[%0] : memref<16xf32, #gpu.address_space> + %2 = arith.addf %1, %cst_0 : f32 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_4[%3] : memref<16xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_5[%arg3] : memref<8xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_6 = memref.alloca() : memref<4xf32, #gpu.address_space> + scf.forall (%arg3) in (4) { + %0 = arith.muli %arg3, %c2 : index + %1 = memref.load %alloca_5[%0] : memref<8xf32, #gpu.address_space> + %2 = arith.addf %1, %cst_0 : f32 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_5[%3] : memref<8xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_6[%arg3] : memref<4xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %alloca_7 = memref.alloca() : memref<2xf32, #gpu.address_space> + scf.forall (%arg3) in (2) { + %0 = arith.muli %arg3, %c2 : index + %1 = memref.load %alloca_6[%0] : memref<4xf32, #gpu.address_space> + %2 = arith.addf %1, %cst_0 : f32 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_6[%3] : memref<4xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloca_7[%arg3] : memref<2xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + scf.forall (%arg3) in (1) { + %0 = arith.muli %arg3, %c2 : index + %1 = memref.load %alloca_7[%0] : memref<2xf32, #gpu.address_space> + %2 = arith.addf %1, %cst_0 : f32 + %3 = arith.addi %0, %c1 : index + %4 = memref.load %alloca_7[%3] : memref<2xf32, #gpu.address_space> + %5 = arith.addf %4, %2 : f32 + memref.store %5, %alloc[] : memref + } {mapping = [#gpu.thread]} + } {mapping = [#gpu.block]} + return %alloc : memref + } + func.func private @Unknown148(%arg0: memref) -> memref attributes {__byteir_elementwise_fusion__} { %cst = arith.constant 4.000000e+00 : f32 %alloc = 
memref.alloc() : memref %0 = memref.load %arg0[] : memref @@ -2772,871 +1536,292 @@ module @IrToMhlo.2452 { memref.store %2, %alloc[] : memref return %alloc : memref } - func.func private @Unknown142(%arg0: memref<64x3x7x7xf16>) -> memref<64x3x7x7xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c9408 = arith.constant 9408 : index - %c1 = arith.constant 1 : index + func.func private @Unknown149(%arg0: memref<64x3x7x7xf16>) -> memref<64x3x7x7xf32> attributes {__byteir_elementwise_fusion__} { %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c9408 = arith.constant 9408 : index %alloc = memref.alloc() : memref<64x3x7x7xf32> scf.for %arg1 = %c0 to %c9408 step %c1 { %0 = arith.remsi %arg1, %c7 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c7 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c7 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c7 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c7 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c7 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c3 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c3 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c3 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<64x3x7x7xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<64x3x7x7xf32> + %1 = arith.divsi %arg1, %c7 : index + %2 = arith.remsi %1, %c7 : index + %3 = arith.divsi %1, %c7 : index + %4 = arith.remsi %3, %c3 : index + %5 = arith.divsi %3, %c3 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<64x3x7x7xf16> + %7 = arith.extf %6 : f16 to f32 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<64x3x7x7xf32> } return %alloc : memref<64x3x7x7xf32> } - func.func private @Unknown143(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c1 = arith.constant 1 : index + func.func private @Unknown150(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %alloc = memref.alloc() : memref<64x64x3x3xf32> - scf.for %arg1 = %c0 to %c36864 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, 
%c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<64x64x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<64x64x3x3xf32> - } - return %alloc : memref<64x64x3x3xf32> - } - func.func private @Unknown144(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index - %alloc = memref.alloc() : memref<64x64x3x3xf32> - scf.for %arg1 = %c0 to %c36864 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<64x64x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<64x64x3x3xf32> - } - return %alloc : memref<64x64x3x3xf32> - } - func.func private @Unknown145(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { %c0 = arith.constant 0 : index %c36864 = arith.constant 36864 : index - %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index %alloc = memref.alloc() : memref<64x64x3x3xf32> scf.for %arg1 = %c0 to %c36864 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = 
arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<64x64x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<64x64x3x3xf32> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c64 : index + %5 = arith.divsi %3, %c64 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<64x64x3x3xf16> + %7 = arith.extf %6 : f16 to f32 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<64x64x3x3xf32> } return %alloc : memref<64x64x3x3xf32> } - func.func private @Unknown146(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c1 = arith.constant 1 : index + func.func private @Unknown154(%arg0: memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index - %alloc = memref.alloc() : memref<64x64x3x3xf32> - scf.for %arg1 = %c0 to %c36864 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<64x64x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<64x64x3x3xf32> - } - return %alloc : memref<64x64x3x3xf32> - } - func.func private @Unknown147(%arg0: memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> attributes {__byteir_elementwise_fusion__} { + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %c73728 = 
arith.constant 73728 : index - %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index %alloc = memref.alloc() : memref<128x64x3x3xf32> scf.for %arg1 = %c0 to %c73728 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c64 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c64 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c64 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<128x64x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<128x64x3x3xf32> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c64 : index + %5 = arith.divsi %3, %c64 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<128x64x3x3xf16> + %7 = arith.extf %6 : f16 to f32 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<128x64x3x3xf32> } return %alloc : memref<128x64x3x3xf32> } - func.func private @Unknown148(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c1 = arith.constant 1 : index + func.func private @Unknown155(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + %c1 = arith.constant 1 : index %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %c147456 = arith.constant 147456 : index %alloc = memref.alloc() : memref<128x128x3x3xf32> scf.for %arg1 = %c0 to %c147456 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c128 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c128 
: index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c128 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<128x128x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<128x128x3x3xf32> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c128 : index + %5 = arith.divsi %3, %c128 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<128x128x3x3xf16> + %7 = arith.extf %6 : f16 to f32 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<128x128x3x3xf32> } return %alloc : memref<128x128x3x3xf32> } - func.func private @Unknown149(%arg0: memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown156(%arg0: memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %c8192 = arith.constant 8192 : index - %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<128x64x1x1xf32> scf.for %arg1 = %c0 to %c8192 step %c1 { %0 = arith.remsi %arg1, %c64 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c64 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c64 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = memref.load %arg0[%9, %3, %c0, %c0] : memref<128x64x1x1xf16> - %11 = arith.extf %10 : f16 to f32 - memref.store %11, %alloc[%9, %3, %c0, %c0] : memref<128x64x1x1xf32> + %1 = arith.divsi %arg1, %c64 : index + %2 = memref.load %arg0[%1, %0, %c0, %c0] : memref<128x64x1x1xf16> + %3 = arith.extf %2 : f16 to f32 + memref.store %3, %alloc[%1, %0, %c0, %c0] : memref<128x64x1x1xf32> } return %alloc : memref<128x64x1x1xf32> } - func.func private @Unknown150(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c1 = arith.constant 1 : index + func.func private @Unknown159(%arg0: memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index - %alloc = memref.alloc() : memref<128x128x3x3xf32> - scf.for %arg1 = %c0 to %c147456 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = 
arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c128 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c128 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c128 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<128x128x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<128x128x3x3xf32> - } - return %alloc : memref<128x128x3x3xf32> - } - func.func private @Unknown151(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %alloc = memref.alloc() : memref<128x128x3x3xf32> - scf.for %arg1 = %c0 to %c147456 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c128 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c128 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c128 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<128x128x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<128x128x3x3xf32> - } - return %alloc : memref<128x128x3x3xf32> - } - func.func private @Unknown152(%arg0: memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> attributes {__byteir_elementwise_fusion__} { %c0 = arith.constant 0 : index %c294912 = arith.constant 294912 : index - %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index %alloc = memref.alloc() : memref<256x128x3x3xf32> scf.for %arg1 = %c0 to %c294912 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 
: index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c128 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c128 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c128 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<256x128x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<256x128x3x3xf32> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c128 : index + %5 = arith.divsi %3, %c128 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<256x128x3x3xf16> + %7 = arith.extf %6 : f16 to f32 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<256x128x3x3xf32> } return %alloc : memref<256x128x3x3xf32> } - func.func private @Unknown153(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c1 = arith.constant 1 : index + func.func private @Unknown160(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + %c1 = arith.constant 1 : index %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index + %c589824 = arith.constant 589824 : index %alloc = memref.alloc() : memref<256x256x3x3xf32> scf.for %arg1 = %c0 to %c589824 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c256 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c256 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c256 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<256x256x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<256x256x3x3xf32> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c256 : index + %5 = arith.divsi %3, %c256 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<256x256x3x3xf16> + %7 = arith.extf %6 : f16 to f32 + memref.store %7, %alloc[%5, %4, %2, %0] : 
memref<256x256x3x3xf32> } return %alloc : memref<256x256x3x3xf32> } - func.func private @Unknown154(%arg0: memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown161(%arg0: memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c128 = arith.constant 128 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %c32768 = arith.constant 32768 : index - %c1 = arith.constant 1 : index - %c128 = arith.constant 128 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<256x128x1x1xf32> scf.for %arg1 = %c0 to %c32768 step %c1 { %0 = arith.remsi %arg1, %c128 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c128 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c128 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = memref.load %arg0[%9, %3, %c0, %c0] : memref<256x128x1x1xf16> - %11 = arith.extf %10 : f16 to f32 - memref.store %11, %alloc[%9, %3, %c0, %c0] : memref<256x128x1x1xf32> + %1 = arith.divsi %arg1, %c128 : index + %2 = memref.load %arg0[%1, %0, %c0, %c0] : memref<256x128x1x1xf16> + %3 = arith.extf %2 : f16 to f32 + memref.store %3, %alloc[%1, %0, %c0, %c0] : memref<256x128x1x1xf32> } return %alloc : memref<256x128x1x1xf32> } - func.func private @Unknown155(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c1 = arith.constant 1 : index + func.func private @Unknown164(%arg0: memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index - %alloc = memref.alloc() : memref<256x256x3x3xf32> - scf.for %arg1 = %c0 to %c589824 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c256 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c256 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c256 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<256x256x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<256x256x3x3xf32> - } - return %alloc : memref<256x256x3x3xf32> - } - func.func private @Unknown156(%arg0: memref<256x256x3x3xf16>) -> 
memref<256x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %alloc = memref.alloc() : memref<256x256x3x3xf32> - scf.for %arg1 = %c0 to %c589824 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c256 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c256 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c256 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<256x256x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<256x256x3x3xf32> - } - return %alloc : memref<256x256x3x3xf32> - } - func.func private @Unknown157(%arg0: memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> attributes {__byteir_elementwise_fusion__} { %c0 = arith.constant 0 : index %c1179648 = arith.constant 1179648 : index - %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<512x256x3x3xf32> scf.for %arg1 = %c0 to %c1179648 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c256 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c256 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c256 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<512x256x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : 
memref<512x256x3x3xf32> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c256 : index + %5 = arith.divsi %3, %c256 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<512x256x3x3xf16> + %7 = arith.extf %6 : f16 to f32 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<512x256x3x3xf32> } return %alloc : memref<512x256x3x3xf32> } - func.func private @Unknown158(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c1 = arith.constant 1 : index + func.func private @Unknown165(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + %c1 = arith.constant 1 : index %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index + %c2359296 = arith.constant 2359296 : index %alloc = memref.alloc() : memref<512x512x3x3xf32> scf.for %arg1 = %c0 to %c2359296 step %c1 { %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c512 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c512 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c512 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<512x512x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<512x512x3x3xf32> + %1 = arith.divsi %arg1, %c3 : index + %2 = arith.remsi %1, %c3 : index + %3 = arith.divsi %1, %c3 : index + %4 = arith.remsi %3, %c512 : index + %5 = arith.divsi %3, %c512 : index + %6 = memref.load %arg0[%5, %4, %2, %0] : memref<512x512x3x3xf16> + %7 = arith.extf %6 : f16 to f32 + memref.store %7, %alloc[%5, %4, %2, %0] : memref<512x512x3x3xf32> } return %alloc : memref<512x512x3x3xf32> } - func.func private @Unknown159(%arg0: memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown166(%arg0: memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> attributes {__byteir_elementwise_fusion__} { + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %c131072 = arith.constant 131072 : index - %c1 = arith.constant 1 : index - %c256 = arith.constant 256 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<512x256x1x1xf32> scf.for %arg1 = %c0 to %c131072 step %c1 { %0 = arith.remsi %arg1, %c256 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = 
arith.addi %0, %c256 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c256 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = memref.load %arg0[%9, %3, %c0, %c0] : memref<512x256x1x1xf16> - %11 = arith.extf %10 : f16 to f32 - memref.store %11, %alloc[%9, %3, %c0, %c0] : memref<512x256x1x1xf32> + %1 = arith.divsi %arg1, %c256 : index + %2 = memref.load %arg0[%1, %0, %c0, %c0] : memref<512x256x1x1xf16> + %3 = arith.extf %2 : f16 to f32 + memref.store %3, %alloc[%1, %0, %c0, %c0] : memref<512x256x1x1xf32> } return %alloc : memref<512x256x1x1xf32> } - func.func private @Unknown160(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + func.func private @Unknown170(%arg0: memref<1000x512xf16>) -> memref<1000x512xf32> attributes {__byteir_elementwise_fusion__} { %c512 = arith.constant 512 : index - %alloc = memref.alloc() : memref<512x512x3x3xf32> - scf.for %arg1 = %c0 to %c2359296 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c512 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c512 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c512 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<512x512x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<512x512x3x3xf32> - } - return %alloc : memref<512x512x3x3xf32> - } - func.func private @Unknown161(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byteir_elementwise_fusion__} { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index %c1 = arith.constant 1 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %alloc = memref.alloc() : memref<512x512x3x3xf32> - scf.for %arg1 = %c0 to %c2359296 step %c1 { - %0 = arith.remsi %arg1, %c3 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c3 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c3 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = 
arith.remsi %9, %c3 : index - %11 = arith.cmpi slt, %10, %c0 : index - %12 = arith.addi %10, %c3 : index - %13 = arith.select %11, %12, %10 : index - %14 = arith.cmpi slt, %9, %c0 : index - %15 = arith.subi %c-1, %9 : index - %16 = arith.select %14, %15, %9 : index - %17 = arith.divsi %16, %c3 : index - %18 = arith.subi %c-1, %17 : index - %19 = arith.select %14, %18, %17 : index - %20 = arith.remsi %19, %c512 : index - %21 = arith.cmpi slt, %20, %c0 : index - %22 = arith.addi %20, %c512 : index - %23 = arith.select %21, %22, %20 : index - %24 = arith.cmpi slt, %19, %c0 : index - %25 = arith.subi %c-1, %19 : index - %26 = arith.select %24, %25, %19 : index - %27 = arith.divsi %26, %c512 : index - %28 = arith.subi %c-1, %27 : index - %29 = arith.select %24, %28, %27 : index - %30 = memref.load %arg0[%29, %23, %13, %3] : memref<512x512x3x3xf16> - %31 = arith.extf %30 : f16 to f32 - memref.store %31, %alloc[%29, %23, %13, %3] : memref<512x512x3x3xf32> - } - return %alloc : memref<512x512x3x3xf32> - } - func.func private @Unknown163(%arg0: memref<1000x512xf16>) -> memref<1000x512xf32> attributes {__byteir_elementwise_fusion__} { %c0 = arith.constant 0 : index %c512000 = arith.constant 512000 : index - %c1 = arith.constant 1 : index - %c512 = arith.constant 512 : index - %c-1 = arith.constant -1 : index %alloc = memref.alloc() : memref<1000x512xf32> scf.for %arg1 = %c0 to %c512000 step %c1 { %0 = arith.remsi %arg1, %c512 : index - %1 = arith.cmpi slt, %0, %c0 : index - %2 = arith.addi %0, %c512 : index - %3 = arith.select %1, %2, %0 : index - %4 = arith.cmpi slt, %arg1, %c0 : index - %5 = arith.subi %c-1, %arg1 : index - %6 = arith.select %4, %5, %arg1 : index - %7 = arith.divsi %6, %c512 : index - %8 = arith.subi %c-1, %7 : index - %9 = arith.select %4, %8, %7 : index - %10 = memref.load %arg0[%9, %3] : memref<1000x512xf16> - %11 = arith.extf %10 : f16 to f32 - memref.store %11, %alloc[%9, %3] : memref<1000x512xf32> + %1 = arith.divsi %arg1, %c512 : index + %2 = memref.load %arg0[%1, %0] : memref<1000x512xf16> + %3 = arith.extf %2 : f16 to f32 + memref.store %3, %alloc[%1, %0] : memref<1000x512xf32> } return %alloc : memref<1000x512xf32> } - func.func private @Unknown164(%arg0: memref<1000xf32>) -> memref<1000xf32> attributes {__byteir_elementwise_fusion__} { + func.func private @Unknown171(%arg0: memref<4x1000xf16>) -> memref<1000xf32> attributes {__byteir_reduction_fusion__} { %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 0.000000e+00 : f16 + %cst_0 = arith.constant 0.000000e+00 : f32 + %c-32 = arith.constant -32 : index %c1000 = arith.constant 1000 : index + %c32 = arith.constant 32 : index + %c2 = arith.constant 2 : index + %alloc = memref.alloc() : memref<1000xf32> + scf.forall (%arg1) in (32) { + %0 = arith.muli %arg1, %c-32 : index + %1 = arith.addi %0, %c1000 : index + %2 = arith.cmpi slt, %1, %c32 : index + %3 = arith.select %2, %1, %c32 : index + %4 = arith.muli %arg1, %c32 : index + %alloca = memref.alloca() : memref<32xf32, #gpu.address_space> + %alloca_1 = memref.alloca() : memref<2x32xf32, #gpu.address_space> + scf.forall (%arg2, %arg3) in (2, 32) { + %5 = arith.cmpi slt, %3, %arg3 : index + %6 = arith.select %5, %3, %arg3 : index + %7 = arith.addi %arg3, %c1 : index + %8 = arith.cmpi slt, %3, %7 : index + %9 = arith.select %8, %3, %7 : index + %10 = arith.subi %9, %6 : index + %11 = arith.cmpi ugt, %10, %c0 : index + %12 = scf.if %11 -> (f16) { + %18 = arith.muli %arg2, %c2 : index + %19 = arith.addi %4, %6 : index + %20 = memref.load 
%arg0[%18, %19] : memref<4x1000xf16> + scf.yield %20 : f16 + } else { + scf.yield %cst : f16 + } + %13 = arith.extf %12 : f16 to f32 + %14 = arith.addf %13, %cst_0 : f32 + %15 = scf.if %11 -> (f16) { + %18 = arith.muli %arg2, %c2 : index + %19 = arith.addi %18, %c1 : index + %20 = arith.addi %4, %6 : index + %21 = memref.load %arg0[%19, %20] : memref<4x1000xf16> + scf.yield %21 : f16 + } else { + scf.yield %cst : f16 + } + %16 = arith.extf %15 : f16 to f32 + %17 = arith.addf %14, %16 : f32 + memref.store %17, %alloca_1[%arg2, %arg3] : memref<2x32xf32, #gpu.address_space> + } {mapping = [#gpu.thread, #gpu.thread]} + scf.forall (%arg2) in (32) { + %5 = memref.load %alloca_1[%c0, %arg2] : memref<2x32xf32, #gpu.address_space> + %6 = arith.addf %5, %cst_0 : f32 + %7 = memref.load %alloca_1[%c1, %arg2] : memref<2x32xf32, #gpu.address_space> + %8 = arith.addf %7, %6 : f32 + memref.store %8, %alloca[%arg2] : memref<32xf32, #gpu.address_space> + } {mapping = [#gpu.thread]} + %subview = memref.subview %alloca[0] [%3] [1] : memref<32xf32, #gpu.address_space> to memref, #gpu.address_space> + %subview_2 = memref.subview %alloc[%4] [%3] [1] : memref<1000xf32> to memref> + memref.copy %subview, %subview_2 : memref, #gpu.address_space> to memref> + } {mapping = [#gpu.block]} + return %alloc : memref<1000xf32> + } + func.func private @Unknown172(%arg0: memref<1000xf32>) -> memref<1000xf32> attributes {__byteir_elementwise_fusion__} { %c1 = arith.constant 1 : index + %c1000 = arith.constant 1000 : index + %c0 = arith.constant 0 : index %alloc = memref.alloc() : memref<1000xf32> scf.for %arg1 = %c0 to %c1000 step %c1 { %0 = memref.load %arg0[%arg1] : memref<1000xf32> @@ -3654,344 +1839,340 @@ module @IrToMhlo.2452 { %alloc_0 = memref.alloc() : memref<4x64x112x112xf16> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc, %arg3, %arg4, %alloc_0) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x112x112xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x112x112xf16> %2 = call @Unknown3(%arg7) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> - %3 = call @Unknown4(%arg12) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> - %4 = call @Unknown5(%arg17) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> - %5 = call @Unknown6(%arg22) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> + %3 = call @Unknown3(%arg12) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> + %4 = call @Unknown3(%arg17) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> + %5 = call @Unknown3(%arg22) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> %6 = call @Unknown7(%arg37) : (memref<128x64x1x1xf32>) -> memref<128x64x1x1xf16> %7 = call @Unknown8(%arg27) : (memref<128x64x3x3xf32>) -> memref<128x64x3x3xf16> %8 = call @Unknown9(%arg32) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> - %9 = call @Unknown10(%arg42) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> - %10 = call @Unknown11(%arg47) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> + %9 = call @Unknown9(%arg42) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> + %10 = call @Unknown9(%arg47) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> %11 = call @Unknown12(%arg62) : (memref<256x128x1x1xf32>) -> memref<256x128x1x1xf16> %12 = call @Unknown13(%arg52) : (memref<256x128x3x3xf32>) -> memref<256x128x3x3xf16> %13 = call @Unknown14(%arg57) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> - %14 = call @Unknown15(%arg67) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> - 
%15 = call @Unknown16(%arg72) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> + %14 = call @Unknown14(%arg67) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> + %15 = call @Unknown14(%arg72) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> %16 = call @Unknown17(%arg87) : (memref<512x256x1x1xf32>) -> memref<512x256x1x1xf16> %17 = call @Unknown18(%arg77) : (memref<512x256x3x3xf32>) -> memref<512x256x3x3xf16> %18 = call @Unknown19(%arg82) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> - %19 = call @Unknown20(%arg92) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> - %20 = call @Unknown21(%arg97) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> + %19 = call @Unknown19(%arg92) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> + %20 = call @Unknown19(%arg97) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> %21 = call @Unknown22(%arg1) : (memref<4x1000xf32>) -> memref<4x1000xf16> %22 = call @Unknown23(%arg102) : (memref<1000x512xf32>) -> memref<1000x512xf16> - %alloc_1 = memref.alloc() : memref<4xf16> - byre.compute @ReduceSumOp_f16_f16(%21, %alloc_1) {dimensions = dense<1> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf16>, memref<4xf16> - %23:2 = call @Unknown24(%alloc_0) : (memref<4x64x112x112xf16>) -> (memref<4x64x112x112xf16>, memref<4x64x112x112xi1>) + %23 = call @Unknown24(%arg103) : (memref<1000xf32>) -> memref<1000xf16> + %24 = call @Unknown25(%21) : (memref<4x1000xf16>) -> memref<4xf16> + %25:2 = call @Unknown26(%alloc_0) : (memref<4x64x112x112xf16>) -> (memref<4x64x112x112xf16>, memref<4x64x112x112xi1>) + %alloc_1 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @PoolMaxOp_f16_f16(%25#0, %alloc_1) {base_dilations = dense<1> : tensor<4xi64>, memory_effects = [1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = dense<1> : tensor<4xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16>, memref<4x64x56x56xf16> %alloc_2 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @PoolMaxOp_f16_f16(%23#0, %alloc_2) {base_dilations = dense<1> : tensor<4xi64>, memory_effects = [1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = dense<1> : tensor<4xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16>, memref<4x64x56x56xf16> + byre.compute @ConvOp_f16f16_f16(%alloc_1, %2, %alloc_2) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> %alloc_3 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvOp_f16f16_f16(%alloc_2, %2, %alloc_3) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + byre.compute 
@BatchNormTrainingOp_f16f32f32_f16(%alloc_2, %arg8, %arg9, %alloc_3) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> + %26:2 = call @Unknown28(%alloc_3) : (memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) %alloc_4 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_3, %arg8, %arg9, %alloc_4) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> - %24:2 = call @Unknown26(%alloc_4) : (memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) + byre.compute @ConvOp_f16f16_f16(%26#0, %3, %alloc_4) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> %alloc_5 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvOp_f16f16_f16(%24#0, %3, %alloc_5) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_4, %arg13, %arg14, %alloc_5) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> + %27:2 = call @Unknown30(%alloc_5, %alloc_1) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) %alloc_6 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_5, %arg13, %arg14, %alloc_6) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> - %25:2 = call @Unknown28(%alloc_6, %alloc_2) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) + byre.compute @ConvOp_f16f16_f16(%27#0, %4, %alloc_6) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> %alloc_7 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvOp_f16f16_f16(%25#0, %4, %alloc_7) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation 
= dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_6, %arg18, %arg19, %alloc_7) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> + %28:2 = call @Unknown28(%alloc_7) : (memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) %alloc_8 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_7, %arg18, %arg19, %alloc_8) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> - %26:2 = call @Unknown30(%alloc_8) : (memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) + byre.compute @ConvOp_f16f16_f16(%28#0, %5, %alloc_8) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> %alloc_9 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvOp_f16f16_f16(%26#0, %5, %alloc_9) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> - %alloc_10 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_9, %arg23, %arg24, %alloc_10) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> - %27:2 = call @Unknown32(%alloc_10, %25#0) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_8, %arg23, %arg24, %alloc_9) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> + %29:2 = call @Unknown30(%alloc_9, %27#0) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) + %alloc_10 = memref.alloc() : memref<4x128x28x28xf16> + byre.compute @ConvOp_f16f16_f16(%29#0, %6, %alloc_10) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<128x64x1x1xf16>, memref<4x128x28x28xf16> %alloc_11 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvOp_f16f16_f16(%27#0, %6, %alloc_11) {batch_group_count = 1 : i64, 
feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<128x64x1x1xf16>, memref<4x128x28x28xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_10, %arg38, %arg39, %alloc_11) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> %alloc_12 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_11, %arg38, %arg39, %alloc_12) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> + byre.compute @ConvOp_f16f16_f16(%29#0, %7, %alloc_12) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<128x64x3x3xf16>, memref<4x128x28x28xf16> %alloc_13 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvOp_f16f16_f16(%27#0, %7, %alloc_13) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<128x64x3x3xf16>, memref<4x128x28x28xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_12, %arg28, %arg29, %alloc_13) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> + %30:2 = call @Unknown37(%alloc_13) : (memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) %alloc_14 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_13, %arg28, %arg29, %alloc_14) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> - %28:2 = call @Unknown35(%alloc_14) : (memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) + byre.compute @ConvOp_f16f16_f16(%30#0, %8, %alloc_14) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> %alloc_15 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvOp_f16f16_f16(%28#0, %8, %alloc_15) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", 
lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_14, %arg33, %arg34, %alloc_15) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> + %31:2 = call @Unknown39(%alloc_15, %alloc_11) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) %alloc_16 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_15, %arg33, %arg34, %alloc_16) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> - %29:2 = call @Unknown37(%alloc_16, %alloc_12) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) + byre.compute @ConvOp_f16f16_f16(%31#0, %9, %alloc_16) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> %alloc_17 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvOp_f16f16_f16(%29#0, %9, %alloc_17) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_16, %arg43, %arg44, %alloc_17) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> + %32:2 = call @Unknown37(%alloc_17) : (memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) %alloc_18 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_17, %arg43, %arg44, %alloc_18) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> - %30:2 = call @Unknown39(%alloc_18) : (memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) + byre.compute @ConvOp_f16f16_f16(%32#0, %10, %alloc_18) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, 
memref<4x128x28x28xf16> %alloc_19 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvOp_f16f16_f16(%30#0, %10, %alloc_19) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> - %alloc_20 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_19, %arg48, %arg49, %alloc_20) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> - %31:2 = call @Unknown41(%alloc_20, %29#0) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_18, %arg48, %arg49, %alloc_19) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> + %33:2 = call @Unknown39(%alloc_19, %31#0) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) + %alloc_20 = memref.alloc() : memref<4x256x14x14xf16> + byre.compute @ConvOp_f16f16_f16(%33#0, %11, %alloc_20) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<256x128x1x1xf16>, memref<4x256x14x14xf16> %alloc_21 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvOp_f16f16_f16(%31#0, %11, %alloc_21) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<256x128x1x1xf16>, memref<4x256x14x14xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_20, %arg63, %arg64, %alloc_21) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> %alloc_22 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_21, %arg63, %arg64, %alloc_22) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> + byre.compute @ConvOp_f16f16_f16(%33#0, %12, %alloc_22) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : 
memref<4x128x28x28xf16>, memref<256x128x3x3xf16>, memref<4x256x14x14xf16> %alloc_23 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvOp_f16f16_f16(%31#0, %12, %alloc_23) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<256x128x3x3xf16>, memref<4x256x14x14xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_22, %arg53, %arg54, %alloc_23) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> + %34:2 = call @Unknown46(%alloc_23) : (memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) %alloc_24 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_23, %arg53, %arg54, %alloc_24) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> - %32:2 = call @Unknown44(%alloc_24) : (memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) + byre.compute @ConvOp_f16f16_f16(%34#0, %13, %alloc_24) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> %alloc_25 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvOp_f16f16_f16(%32#0, %13, %alloc_25) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_24, %arg58, %arg59, %alloc_25) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> + %35:2 = call @Unknown48(%alloc_25, %alloc_21) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) %alloc_26 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_25, %arg58, %arg59, %alloc_26) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> - %33:2 = call @Unknown46(%alloc_26, %alloc_22) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) + byre.compute @ConvOp_f16f16_f16(%35#0, %14, %alloc_26) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = 
"NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> %alloc_27 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvOp_f16f16_f16(%33#0, %14, %alloc_27) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_26, %arg68, %arg69, %alloc_27) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> + %36:2 = call @Unknown46(%alloc_27) : (memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) %alloc_28 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_27, %arg68, %arg69, %alloc_28) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> - %34:2 = call @Unknown48(%alloc_28) : (memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) + byre.compute @ConvOp_f16f16_f16(%36#0, %15, %alloc_28) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> %alloc_29 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvOp_f16f16_f16(%34#0, %15, %alloc_29) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> - %alloc_30 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_29, %arg73, %arg74, %alloc_30) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> - %35:2 = call @Unknown50(%alloc_30, %33#0) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_28, %arg73, %arg74, %alloc_29) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> + %37:2 = call @Unknown48(%alloc_29, %35#0) : (memref<4x256x14x14xf16>, 
memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) + %alloc_30 = memref.alloc() : memref<4x512x7x7xf16> + byre.compute @ConvOp_f16f16_f16(%37#0, %16, %alloc_30) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<512x256x1x1xf16>, memref<4x512x7x7xf16> %alloc_31 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvOp_f16f16_f16(%35#0, %16, %alloc_31) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<512x256x1x1xf16>, memref<4x512x7x7xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_30, %arg88, %arg89, %alloc_31) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> %alloc_32 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_31, %arg88, %arg89, %alloc_32) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> + byre.compute @ConvOp_f16f16_f16(%37#0, %17, %alloc_32) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<512x256x3x3xf16>, memref<4x512x7x7xf16> %alloc_33 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvOp_f16f16_f16(%35#0, %17, %alloc_33) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<512x256x3x3xf16>, memref<4x512x7x7xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_32, %arg78, %arg79, %alloc_33) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> + %38:2 = call @Unknown55(%alloc_33) : (memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) %alloc_34 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_33, %arg78, %arg79, %alloc_34) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> - %36:2 = call @Unknown53(%alloc_34) : (memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, 
memref<4x512x7x7xi1>) + byre.compute @ConvOp_f16f16_f16(%38#0, %18, %alloc_34) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> %alloc_35 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvOp_f16f16_f16(%36#0, %18, %alloc_35) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_34, %arg83, %arg84, %alloc_35) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> + %39:2 = call @Unknown57(%alloc_35, %alloc_31) : (memref<4x512x7x7xf16>, memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) %alloc_36 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_35, %arg83, %arg84, %alloc_36) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> - %37:2 = call @Unknown55(%alloc_36, %alloc_32) : (memref<4x512x7x7xf16>, memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) + byre.compute @ConvOp_f16f16_f16(%39#0, %19, %alloc_36) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> %alloc_37 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvOp_f16f16_f16(%37#0, %19, %alloc_37) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_36, %arg93, %arg94, %alloc_37) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> + %40:2 = call @Unknown55(%alloc_37) : (memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) %alloc_38 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_37, %arg93, %arg94, %alloc_38) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : 
memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> - %38:2 = call @Unknown57(%alloc_38) : (memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) + byre.compute @ConvOp_f16f16_f16(%40#0, %20, %alloc_38) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> %alloc_39 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvOp_f16f16_f16(%38#0, %20, %alloc_39) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> - %alloc_40 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_39, %arg98, %arg99, %alloc_40) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> - %39:2 = call @Unknown59(%alloc_40, %37#0) : (memref<4x512x7x7xf16>, memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_38, %arg98, %arg99, %alloc_39) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> + %41:2 = call @Unknown57(%alloc_39, %39#0) : (memref<4x512x7x7xf16>, memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) + %42 = call @Unknown62(%41#0) : (memref<4x512x7x7xf16>) -> memref<4x512xf16> + %43 = call @Unknown63(%42) : (memref<4x512xf16>) -> memref<4x512xf16> + %alloc_40 = memref.alloc() : memref<4x1000xf16> + byre.compute @MatmulOp_f16f16_f16(%43, %22, %alloc_40) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<4x512xf16>, memref<1000x512xf16>, memref<4x1000xf16> + %44 = call @Unknown64(%23, %alloc_40) : (memref<1000xf16>, memref<4x1000xf16>) -> memref<4x1000xf16> + %45 = call @Unknown65(%44) : (memref<4x1000xf16>) -> memref<4xf16> + %46 = call @Unknown66(%45, %44) : (memref<4xf16>, memref<4x1000xf16>) -> memref<4x1000xf16> + %47 = call @Unknown67(%46) : (memref<4x1000xf16>) -> memref<4xf16> + %48 = call @Unknown68(%47) : (memref<4xf16>) -> memref<4xf16> + %49:2 = call @Unknown69(%48, %46, %24, %21) : (memref<4xf16>, memref<4x1000xf16>, memref<4xf16>, memref<4x1000xf16>) -> (memref<4x1000xf16>, memref<4x1000xf16>) %alloc_41 = memref.alloc() : memref<4x512xf16> - byre.compute @ReduceSumOp_f16_f16(%39#0, %alloc_41) {dimensions = dense<[3, 2]> : tensor<2xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<4x512xf16> - %40 = call @Unknown60(%alloc_41) : (memref<4x512xf16>) -> memref<4x512xf16> - %alloc_42 = memref.alloc() : memref<4x1000xf16> - byre.compute @MatmulOp_f16f16_f16(%40, %22, %alloc_42) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : 
i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<4x512xf16>, memref<1000x512xf16>, memref<4x1000xf16> - %41 = call @Unknown61(%arg103, %alloc_42) : (memref<1000xf32>, memref<4x1000xf16>) -> memref<4x1000xf16> - %alloc_43 = memref.alloc() : memref<4xf16> - byre.compute @ReduceMaxOp_f16_f16(%41, %alloc_43) {dimensions = dense<1> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf16>, memref<4xf16> - %42:2 = call @Unknown62(%alloc_43, %41) : (memref<4xf16>, memref<4x1000xf16>) -> (memref<4x1000xf16>, memref<4x1000xf16>) - %alloc_44 = memref.alloc() : memref<4xf16> - byre.compute @ReduceSumOp_f16_f16(%42#1, %alloc_44) {dimensions = dense<1> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf16>, memref<4xf16> - %43:3 = call @Unknown63(%alloc_44, %42#0, %alloc_1, %21, %arg1) : (memref<4xf16>, memref<4x1000xf16>, memref<4xf16>, memref<4x1000xf16>, memref<4x1000xf32>) -> (memref<4x1000xf16>, memref<4x1000xf32>, memref<4x1000xf32>) - %alloc_45 = memref.alloc() : memref<4x512xf16> - byre.compute @MatmulOp_f16f16_f16(%43#0, %22, %alloc_45) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<4x1000xf16>, memref<1000x512xf16>, memref<4x512xf16> - %44 = call @Unknown64(%alloc_45, %39#1) : (memref<4x512xf16>, memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> - %alloc_46 = memref.alloc() : memref<4x512x7x7xf16> - %alloc_47 = memref.alloc() : memref<512xf32> + byre.compute @MatmulOp_f16f16_f16(%49#1, %22, %alloc_41) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<4x1000xf16>, memref<1000x512xf16>, memref<4x512xf16> + %50 = call @Unknown70(%alloc_41, %41#1) : (memref<4x512xf16>, memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> + %alloc_42 = memref.alloc() : memref<4x512x7x7xf16> + %alloc_43 = memref.alloc() : memref<512xf32> + %alloc_44 = memref.alloc() : memref<512xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_38, %arg98, %50, %alloc_42, %alloc_43, %alloc_44) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> + %alloc_45 = memref.alloc() : memref<4x512x7x7xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_42, %20, %alloc_45) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> + %alloc_46 = memref.alloc() : memref<512x512x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%40#0, %alloc_42, %alloc_46) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512x512x3x3xf16> + %51 = call @Unknown74(%40#1, %alloc_45) : (memref<4x512x7x7xi1>, memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> + %alloc_47 = memref.alloc() : memref<4x512x7x7xf16> %alloc_48 = memref.alloc() : memref<512xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_39, 
%arg98, %44, %alloc_46, %alloc_47, %alloc_48) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %alloc_49 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_46, %20, %alloc_49) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> - %alloc_50 = memref.alloc() : memref<512x512x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%38#0, %alloc_46, %alloc_50) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512x512x3x3xf16> - %45 = call @Unknown68(%38#1, %alloc_49) : (memref<4x512x7x7xi1>, memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> - %alloc_51 = memref.alloc() : memref<4x512x7x7xf16> - %alloc_52 = memref.alloc() : memref<512xf32> + %alloc_49 = memref.alloc() : memref<512xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_36, %arg93, %51, %alloc_47, %alloc_48, %alloc_49) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> + %alloc_50 = memref.alloc() : memref<4x512x7x7xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_47, %19, %alloc_50) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> + %alloc_51 = memref.alloc() : memref<512x512x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%39#0, %alloc_47, %alloc_51) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512x512x3x3xf16> + %52 = call @Unknown78(%50, %alloc_50, %39#1) : (memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> + %alloc_52 = memref.alloc() : memref<4x512x7x7xf16> %alloc_53 = memref.alloc() : memref<512xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_37, %arg93, %45, %alloc_51, %alloc_52, %alloc_53) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %alloc_54 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_51, %19, %alloc_54) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = 
"NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> - %alloc_55 = memref.alloc() : memref<512x512x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%37#0, %alloc_51, %alloc_55) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512x512x3x3xf16> - %46 = call @Unknown72(%44, %alloc_54, %37#1) : (memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> - %alloc_56 = memref.alloc() : memref<4x512x7x7xf16> - %alloc_57 = memref.alloc() : memref<512xf32> + %alloc_54 = memref.alloc() : memref<512xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_34, %arg83, %52, %alloc_52, %alloc_53, %alloc_54) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> + %alloc_55 = memref.alloc() : memref<4x512x7x7xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_52, %18, %alloc_55) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> + %alloc_56 = memref.alloc() : memref<512x512x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%38#0, %alloc_52, %alloc_56) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512x512x3x3xf16> + %53 = call @Unknown74(%38#1, %alloc_55) : (memref<4x512x7x7xi1>, memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> + %alloc_57 = memref.alloc() : memref<4x512x7x7xf16> %alloc_58 = memref.alloc() : memref<512xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_35, %arg83, %46, %alloc_56, %alloc_57, %alloc_58) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %alloc_59 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_56, %18, %alloc_59) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> - %alloc_60 = memref.alloc() : memref<512x512x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%36#0, %alloc_56, %alloc_60) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", 
memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512x512x3x3xf16> - %47 = call @Unknown76(%36#1, %alloc_59) : (memref<4x512x7x7xi1>, memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> - %alloc_61 = memref.alloc() : memref<4x512x7x7xf16> - %alloc_62 = memref.alloc() : memref<512xf32> + %alloc_59 = memref.alloc() : memref<512xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_32, %arg78, %53, %alloc_57, %alloc_58, %alloc_59) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> + %alloc_60 = memref.alloc() : memref<4x256x14x14xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_57, %17, %alloc_60) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x256x3x3xf16>, memref<4x256x14x14xf16> + %alloc_61 = memref.alloc() : memref<512x256x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%37#0, %alloc_57, %alloc_61) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x512x7x7xf16>, memref<512x256x3x3xf16> + %alloc_62 = memref.alloc() : memref<4x512x7x7xf16> %alloc_63 = memref.alloc() : memref<512xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_33, %arg78, %47, %alloc_61, %alloc_62, %alloc_63) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %alloc_64 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_61, %17, %alloc_64) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x256x3x3xf16>, memref<4x256x14x14xf16> - %alloc_65 = memref.alloc() : memref<512x256x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%35#0, %alloc_61, %alloc_65) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x512x7x7xf16>, memref<512x256x3x3xf16> - %alloc_66 = memref.alloc() : memref<4x512x7x7xf16> - %alloc_67 = memref.alloc() : memref<512xf32> - %alloc_68 = memref.alloc() : memref<512xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_31, %arg88, %46, %alloc_66, %alloc_67, %alloc_68) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : 
memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %alloc_69 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_66, %16, %alloc_69) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x256x1x1xf16>, memref<4x256x14x14xf16> - %alloc_70 = memref.alloc() : memref<512x256x1x1xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%35#0, %alloc_66, %alloc_70) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x512x7x7xf16>, memref<512x256x1x1xf16> - %48 = call @Unknown83(%alloc_69, %alloc_64, %35#1) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> - %alloc_71 = memref.alloc() : memref<4x256x14x14xf16> - %alloc_72 = memref.alloc() : memref<256xf32> + %alloc_64 = memref.alloc() : memref<512xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_30, %arg88, %52, %alloc_62, %alloc_63, %alloc_64) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> + %alloc_65 = memref.alloc() : memref<4x256x14x14xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_62, %16, %alloc_65) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x256x1x1xf16>, memref<4x256x14x14xf16> + %alloc_66 = memref.alloc() : memref<512x256x1x1xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%37#0, %alloc_62, %alloc_66) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x512x7x7xf16>, memref<512x256x1x1xf16> + %54 = call @Unknown89(%alloc_65, %alloc_60, %37#1) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> + %alloc_67 = memref.alloc() : memref<4x256x14x14xf16> + %alloc_68 = memref.alloc() : memref<256xf32> + %alloc_69 = memref.alloc() : memref<256xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_28, %arg73, %54, %alloc_67, %alloc_68, %alloc_69) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> + %alloc_70 = memref.alloc() : memref<4x256x14x14xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_67, %15, %alloc_70) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = 
[1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> + %alloc_71 = memref.alloc() : memref<256x256x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%36#0, %alloc_67, %alloc_71) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256x256x3x3xf16> + %55 = call @Unknown93(%36#1, %alloc_70) : (memref<4x256x14x14xi1>, memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> + %alloc_72 = memref.alloc() : memref<4x256x14x14xf16> %alloc_73 = memref.alloc() : memref<256xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_29, %arg73, %48, %alloc_71, %alloc_72, %alloc_73) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %alloc_74 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_71, %15, %alloc_74) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> - %alloc_75 = memref.alloc() : memref<256x256x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%34#0, %alloc_71, %alloc_75) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256x256x3x3xf16> - %49 = call @Unknown87(%34#1, %alloc_74) : (memref<4x256x14x14xi1>, memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> - %alloc_76 = memref.alloc() : memref<4x256x14x14xf16> - %alloc_77 = memref.alloc() : memref<256xf32> + %alloc_74 = memref.alloc() : memref<256xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_26, %arg68, %55, %alloc_72, %alloc_73, %alloc_74) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> + %alloc_75 = memref.alloc() : memref<4x256x14x14xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_72, %14, %alloc_75) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> + %alloc_76 = memref.alloc() : memref<256x256x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%35#0, %alloc_72, %alloc_76) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : 
i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256x256x3x3xf16> + %56 = call @Unknown89(%54, %alloc_75, %35#1) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> + %alloc_77 = memref.alloc() : memref<4x256x14x14xf16> %alloc_78 = memref.alloc() : memref<256xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_27, %arg68, %49, %alloc_76, %alloc_77, %alloc_78) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %alloc_79 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_76, %14, %alloc_79) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> - %alloc_80 = memref.alloc() : memref<256x256x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%33#0, %alloc_76, %alloc_80) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256x256x3x3xf16> - %50 = call @Unknown91(%48, %alloc_79, %33#1) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> - %alloc_81 = memref.alloc() : memref<4x256x14x14xf16> - %alloc_82 = memref.alloc() : memref<256xf32> + %alloc_79 = memref.alloc() : memref<256xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_24, %arg58, %56, %alloc_77, %alloc_78, %alloc_79) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> + %alloc_80 = memref.alloc() : memref<4x256x14x14xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_77, %13, %alloc_80) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> + %alloc_81 = memref.alloc() : memref<256x256x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%34#0, %alloc_77, %alloc_81) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256x256x3x3xf16> + %57 = call @Unknown93(%34#1, %alloc_80) : (memref<4x256x14x14xi1>, memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> + %alloc_82 = memref.alloc() : memref<4x256x14x14xf16> %alloc_83 = memref.alloc() : memref<256xf32> - 
byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_25, %arg58, %50, %alloc_81, %alloc_82, %alloc_83) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %alloc_84 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_81, %13, %alloc_84) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> - %alloc_85 = memref.alloc() : memref<256x256x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%32#0, %alloc_81, %alloc_85) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256x256x3x3xf16> - %51 = call @Unknown95(%32#1, %alloc_84) : (memref<4x256x14x14xi1>, memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> - %alloc_86 = memref.alloc() : memref<4x256x14x14xf16> - %alloc_87 = memref.alloc() : memref<256xf32> + %alloc_84 = memref.alloc() : memref<256xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_22, %arg53, %57, %alloc_82, %alloc_83, %alloc_84) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> + %alloc_85 = memref.alloc() : memref<4x128x28x28xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_82, %12, %alloc_85) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x128x3x3xf16>, memref<4x128x28x28xf16> + %alloc_86 = memref.alloc() : memref<256x128x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%33#0, %alloc_82, %alloc_86) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x256x14x14xf16>, memref<256x128x3x3xf16> + %alloc_87 = memref.alloc() : memref<4x256x14x14xf16> %alloc_88 = memref.alloc() : memref<256xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_23, %arg53, %51, %alloc_86, %alloc_87, %alloc_88) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %alloc_89 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_86, %12, %alloc_89) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", 
memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x128x3x3xf16>, memref<4x128x28x28xf16> - %alloc_90 = memref.alloc() : memref<256x128x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%31#0, %alloc_86, %alloc_90) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x256x14x14xf16>, memref<256x128x3x3xf16> - %alloc_91 = memref.alloc() : memref<4x256x14x14xf16> - %alloc_92 = memref.alloc() : memref<256xf32> - %alloc_93 = memref.alloc() : memref<256xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_21, %arg63, %50, %alloc_91, %alloc_92, %alloc_93) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %alloc_94 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_91, %11, %alloc_94) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x128x1x1xf16>, memref<4x128x28x28xf16> - %alloc_95 = memref.alloc() : memref<256x128x1x1xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%31#0, %alloc_91, %alloc_95) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x256x14x14xf16>, memref<256x128x1x1xf16> - %52 = call @Unknown102(%alloc_94, %alloc_89, %31#1) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> - %alloc_96 = memref.alloc() : memref<4x128x28x28xf16> - %alloc_97 = memref.alloc() : memref<128xf32> + %alloc_89 = memref.alloc() : memref<256xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_20, %arg63, %56, %alloc_87, %alloc_88, %alloc_89) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> + %alloc_90 = memref.alloc() : memref<4x128x28x28xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_87, %11, %alloc_90) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x128x1x1xf16>, memref<4x128x28x28xf16> + %alloc_91 = memref.alloc() : memref<256x128x1x1xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%33#0, %alloc_87, %alloc_91) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], 
output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x256x14x14xf16>, memref<256x128x1x1xf16> + %58 = call @Unknown108(%alloc_90, %alloc_85, %33#1) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> + %alloc_92 = memref.alloc() : memref<4x128x28x28xf16> + %alloc_93 = memref.alloc() : memref<128xf32> + %alloc_94 = memref.alloc() : memref<128xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_18, %arg48, %58, %alloc_92, %alloc_93, %alloc_94) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> + %alloc_95 = memref.alloc() : memref<4x128x28x28xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_92, %10, %alloc_95) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> + %alloc_96 = memref.alloc() : memref<128x128x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%32#0, %alloc_92, %alloc_96) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128x128x3x3xf16> + %59 = call @Unknown112(%32#1, %alloc_95) : (memref<4x128x28x28xi1>, memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> + %alloc_97 = memref.alloc() : memref<4x128x28x28xf16> %alloc_98 = memref.alloc() : memref<128xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_19, %arg48, %52, %alloc_96, %alloc_97, %alloc_98) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %alloc_99 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_96, %10, %alloc_99) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> - %alloc_100 = memref.alloc() : memref<128x128x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%30#0, %alloc_96, %alloc_100) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128x128x3x3xf16> - %53 = call @Unknown106(%30#1, %alloc_99) : (memref<4x128x28x28xi1>, memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> - %alloc_101 = memref.alloc() : memref<4x128x28x28xf16> - %alloc_102 = memref.alloc() : memref<128xf32> + %alloc_99 = memref.alloc() : 
memref<128xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_16, %arg43, %59, %alloc_97, %alloc_98, %alloc_99) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> + %alloc_100 = memref.alloc() : memref<4x128x28x28xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_97, %9, %alloc_100) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> + %alloc_101 = memref.alloc() : memref<128x128x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%31#0, %alloc_97, %alloc_101) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128x128x3x3xf16> + %60 = call @Unknown108(%58, %alloc_100, %31#1) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> + %alloc_102 = memref.alloc() : memref<4x128x28x28xf16> %alloc_103 = memref.alloc() : memref<128xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_17, %arg43, %53, %alloc_101, %alloc_102, %alloc_103) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %alloc_104 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_101, %9, %alloc_104) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> - %alloc_105 = memref.alloc() : memref<128x128x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%29#0, %alloc_101, %alloc_105) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128x128x3x3xf16> - %54 = call @Unknown110(%52, %alloc_104, %29#1) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> - %alloc_106 = memref.alloc() : memref<4x128x28x28xf16> - %alloc_107 = memref.alloc() : memref<128xf32> + %alloc_104 = memref.alloc() : memref<128xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_14, %arg33, %60, %alloc_102, %alloc_103, %alloc_104) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> + %alloc_105 = 
memref.alloc() : memref<4x128x28x28xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_102, %8, %alloc_105) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> + %alloc_106 = memref.alloc() : memref<128x128x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%30#0, %alloc_102, %alloc_106) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128x128x3x3xf16> + %61 = call @Unknown112(%30#1, %alloc_105) : (memref<4x128x28x28xi1>, memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> + %alloc_107 = memref.alloc() : memref<4x128x28x28xf16> %alloc_108 = memref.alloc() : memref<128xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_15, %arg33, %54, %alloc_106, %alloc_107, %alloc_108) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %alloc_109 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_106, %8, %alloc_109) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> - %alloc_110 = memref.alloc() : memref<128x128x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%28#0, %alloc_106, %alloc_110) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128x128x3x3xf16> - %55 = call @Unknown114(%28#1, %alloc_109) : (memref<4x128x28x28xi1>, memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> - %alloc_111 = memref.alloc() : memref<4x128x28x28xf16> - %alloc_112 = memref.alloc() : memref<128xf32> + %alloc_109 = memref.alloc() : memref<128xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_12, %arg28, %61, %alloc_107, %alloc_108, %alloc_109) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> + %alloc_110 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_107, %7, %alloc_110) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x64x3x3xf16>, memref<4x64x56x56xf16> + 
%alloc_111 = memref.alloc() : memref<128x64x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%29#0, %alloc_107, %alloc_111) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x128x28x28xf16>, memref<128x64x3x3xf16> + %alloc_112 = memref.alloc() : memref<4x128x28x28xf16> %alloc_113 = memref.alloc() : memref<128xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_13, %arg28, %55, %alloc_111, %alloc_112, %alloc_113) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %alloc_114 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_111, %7, %alloc_114) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x64x3x3xf16>, memref<4x64x56x56xf16> - %alloc_115 = memref.alloc() : memref<128x64x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%27#0, %alloc_111, %alloc_115) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x128x28x28xf16>, memref<128x64x3x3xf16> - %alloc_116 = memref.alloc() : memref<4x128x28x28xf16> - %alloc_117 = memref.alloc() : memref<128xf32> - %alloc_118 = memref.alloc() : memref<128xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_11, %arg38, %54, %alloc_116, %alloc_117, %alloc_118) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %alloc_119 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_116, %6, %alloc_119) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x64x1x1xf16>, memref<4x64x56x56xf16> - %alloc_120 = memref.alloc() : memref<128x64x1x1xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%27#0, %alloc_116, %alloc_120) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x128x28x28xf16>, memref<128x64x1x1xf16> - %56 = call @Unknown121(%alloc_119, %alloc_114, %27#1) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> - %alloc_121 = memref.alloc() : memref<4x64x56x56xf16> - %alloc_122 = memref.alloc() : 
memref<64xf32> + %alloc_114 = memref.alloc() : memref<128xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_10, %arg38, %60, %alloc_112, %alloc_113, %alloc_114) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> + %alloc_115 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_112, %6, %alloc_115) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x64x1x1xf16>, memref<4x64x56x56xf16> + %alloc_116 = memref.alloc() : memref<128x64x1x1xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%29#0, %alloc_112, %alloc_116) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x128x28x28xf16>, memref<128x64x1x1xf16> + %62 = call @Unknown127(%alloc_115, %alloc_110, %29#1) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> + %alloc_117 = memref.alloc() : memref<4x64x56x56xf16> + %alloc_118 = memref.alloc() : memref<64xf32> + %alloc_119 = memref.alloc() : memref<64xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_8, %arg23, %62, %alloc_117, %alloc_118, %alloc_119) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> + %alloc_120 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_117, %5, %alloc_120) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + %alloc_121 = memref.alloc() : memref<64x64x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%28#0, %alloc_117, %alloc_121) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> + %63 = call @Unknown131(%28#1, %alloc_120) : (memref<4x64x56x56xi1>, memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> + %alloc_122 = memref.alloc() : memref<4x64x56x56xf16> %alloc_123 = memref.alloc() : memref<64xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_9, %arg23, %56, %alloc_121, %alloc_122, %alloc_123) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> - %alloc_124 = 
memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_121, %5, %alloc_124) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> - %alloc_125 = memref.alloc() : memref<64x64x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%26#0, %alloc_121, %alloc_125) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> - %57 = call @Unknown125(%26#1, %alloc_124) : (memref<4x64x56x56xi1>, memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> - %alloc_126 = memref.alloc() : memref<4x64x56x56xf16> - %alloc_127 = memref.alloc() : memref<64xf32> + %alloc_124 = memref.alloc() : memref<64xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_6, %arg18, %63, %alloc_122, %alloc_123, %alloc_124) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> + %alloc_125 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_122, %4, %alloc_125) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + %alloc_126 = memref.alloc() : memref<64x64x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%27#0, %alloc_122, %alloc_126) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> + %64 = call @Unknown127(%62, %alloc_125, %27#1) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> + %alloc_127 = memref.alloc() : memref<4x64x56x56xf16> %alloc_128 = memref.alloc() : memref<64xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_7, %arg18, %57, %alloc_126, %alloc_127, %alloc_128) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> - %alloc_129 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_126, %4, %alloc_129) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> - %alloc_130 = memref.alloc() 
: memref<64x64x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%25#0, %alloc_126, %alloc_130) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> - %58 = call @Unknown129(%56, %alloc_129, %25#1) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> - %alloc_131 = memref.alloc() : memref<4x64x56x56xf16> - %alloc_132 = memref.alloc() : memref<64xf32> + %alloc_129 = memref.alloc() : memref<64xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_4, %arg13, %64, %alloc_127, %alloc_128, %alloc_129) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> + %alloc_130 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_127, %3, %alloc_130) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + %alloc_131 = memref.alloc() : memref<64x64x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%26#0, %alloc_127, %alloc_131) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> + %65 = call @Unknown131(%26#1, %alloc_130) : (memref<4x64x56x56xi1>, memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> + %alloc_132 = memref.alloc() : memref<4x64x56x56xf16> %alloc_133 = memref.alloc() : memref<64xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_5, %arg13, %58, %alloc_131, %alloc_132, %alloc_133) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> - %alloc_134 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_131, %3, %alloc_134) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> - %alloc_135 = memref.alloc() : memref<64x64x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%24#0, %alloc_131, %alloc_135) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> - %59 = call @Unknown133(%24#1, 
%alloc_134) : (memref<4x64x56x56xi1>, memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> - %alloc_136 = memref.alloc() : memref<4x64x56x56xf16> - %alloc_137 = memref.alloc() : memref<64xf32> - %alloc_138 = memref.alloc() : memref<64xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_3, %arg8, %59, %alloc_136, %alloc_137, %alloc_138) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> - %alloc_139 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_136, %2, %alloc_139) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> - %alloc_140 = memref.alloc() : memref<64x64x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%alloc_2, %alloc_136, %alloc_140) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> - %60 = call @Unknown137(%58, %alloc_139) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> - %alloc_141 = memref.alloc() : memref<4x64x112x112xf16> - byre.compute @PoolMaxGradOp_f16f16_f16(%23#0, %60, %alloc_141) {memory_effects = [1 : i32, 1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16>, memref<4x64x56x56xf16>, memref<4x64x112x112xf16> - %61 = call @Unknown138(%23#1, %alloc_141) : (memref<4x64x112x112xi1>, memref<4x64x112x112xf16>) -> memref<4x64x112x112xf16> - %alloc_142 = memref.alloc() : memref<4x64x112x112xf16> - %alloc_143 = memref.alloc() : memref<64xf32> - %alloc_144 = memref.alloc() : memref<64xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc, %arg3, %61, %alloc_142, %alloc_143, %alloc_144) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x112x112xf16>, memref<64xf32>, memref<4x64x112x112xf16>, memref<4x64x112x112xf16>, memref<64xf32>, memref<64xf32> - %alloc_145 = memref.alloc() : memref<64x3x7x7xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%0, %alloc_142, %alloc_145) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x3x224x224xf16>, memref<4x64x112x112xf16>, memref<64x3x7x7xf16> - %alloc_146 = memref.alloc() : memref - byre.compute @ReduceSumOp_f32_f32(%43#1, %alloc_146) {dimensions = dense<[0, 1]> : tensor<2xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf32>, memref - %62 = call @Unknown141(%alloc_146) : (memref) -> memref - %63 = call @Unknown142(%alloc_145) : (memref<64x3x7x7xf16>) -> memref<64x3x7x7xf32> - %64 = call @Unknown143(%alloc_140) 
: (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %65 = call @Unknown144(%alloc_135) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %66 = call @Unknown145(%alloc_130) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %67 = call @Unknown146(%alloc_125) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %68 = call @Unknown147(%alloc_115) : (memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> - %69 = call @Unknown148(%alloc_110) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> - %70 = call @Unknown149(%alloc_120) : (memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> - %71 = call @Unknown150(%alloc_105) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> - %72 = call @Unknown151(%alloc_100) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> - %73 = call @Unknown152(%alloc_90) : (memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> - %74 = call @Unknown153(%alloc_85) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> - %75 = call @Unknown154(%alloc_95) : (memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> - %76 = call @Unknown155(%alloc_80) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> - %77 = call @Unknown156(%alloc_75) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> - %78 = call @Unknown157(%alloc_65) : (memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> - %79 = call @Unknown158(%alloc_60) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> - %80 = call @Unknown159(%alloc_70) : (memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> - %81 = call @Unknown160(%alloc_55) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> - %82 = call @Unknown161(%alloc_50) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> - %alloc_147 = memref.alloc() : memref<1000x512xf16> - byre.compute @MatmulOp_f16f16_f16(%40, %43#0, %alloc_147) {lhs_contracting_dimension = 0 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_transpose, rhs_contracting_dimension = 0 : i64} : memref<4x512xf16>, memref<4x1000xf16>, memref<1000x512xf16> - %83 = call @Unknown163(%alloc_147) : (memref<1000x512xf16>) -> memref<1000x512xf32> - %alloc_148 = memref.alloc() : memref<1000xf32> - byre.compute @ReduceSumOp_f32_f32(%43#2, %alloc_148) {dimensions = dense<0> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf32>, memref<1000xf32> - %84 = call @Unknown164(%alloc_148) : (memref<1000xf32>) -> memref<1000xf32> - return %62, %63, %alloc_143, %alloc_144, %64, %alloc_137, %alloc_138, %65, %alloc_132, %alloc_133, %66, %alloc_127, %alloc_128, %67, %alloc_122, %alloc_123, %68, %alloc_112, %alloc_113, %69, %alloc_107, %alloc_108, %70, %alloc_117, %alloc_118, %71, %alloc_102, %alloc_103, %72, %alloc_97, %alloc_98, %73, %alloc_87, %alloc_88, %74, %alloc_82, %alloc_83, %75, %alloc_92, %alloc_93, %76, %alloc_77, %alloc_78, %77, %alloc_72, %alloc_73, %78, %alloc_62, %alloc_63, %79, %alloc_57, %alloc_58, %80, %alloc_67, %alloc_68, %81, %alloc_52, %alloc_53, %82, %alloc_47, %alloc_48, %83, %84 : memref, memref<64x3x7x7xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<128x64x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x64x1x1xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, 
memref<256x128x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x128x1x1xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<512x256x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x256x1x1xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<1000x512xf32>, memref<1000xf32> + %alloc_134 = memref.alloc() : memref<64xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_2, %arg8, %65, %alloc_132, %alloc_133, %alloc_134) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> + %alloc_135 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_132, %2, %alloc_135) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + %alloc_136 = memref.alloc() : memref<64x64x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%alloc_1, %alloc_132, %alloc_136) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> + %66 = call @Unknown143(%64, %alloc_135) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> + %alloc_137 = memref.alloc() : memref<4x64x112x112xf16> + byre.compute @PoolMaxGradOp_f16f16_f16(%25#0, %66, %alloc_137) {memory_effects = [1 : i32, 1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16>, memref<4x64x56x56xf16>, memref<4x64x112x112xf16> + %67 = call @Unknown144(%25#1, %alloc_137) : (memref<4x64x112x112xi1>, memref<4x64x112x112xf16>) -> memref<4x64x112x112xf16> + %alloc_138 = memref.alloc() : memref<4x64x112x112xf16> + %alloc_139 = memref.alloc() : memref<64xf32> + %alloc_140 = memref.alloc() : memref<64xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc, %arg3, %67, %alloc_138, %alloc_139, %alloc_140) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x112x112xf16>, memref<64xf32>, memref<4x64x112x112xf16>, memref<4x64x112x112xf16>, memref<64xf32>, memref<64xf32> + %alloc_141 = memref.alloc() : memref<64x3x7x7xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%0, %alloc_138, %alloc_141) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : 
memref<4x3x224x224xf16>, memref<4x64x112x112xf16>, memref<64x3x7x7xf16> + %68 = call @Unknown147(%49#0, %arg1) : (memref<4x1000xf16>, memref<4x1000xf32>) -> memref + %69 = call @Unknown148(%68) : (memref) -> memref + %70 = call @Unknown149(%alloc_141) : (memref<64x3x7x7xf16>) -> memref<64x3x7x7xf32> + %71 = call @Unknown150(%alloc_136) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %72 = call @Unknown150(%alloc_131) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %73 = call @Unknown150(%alloc_126) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %74 = call @Unknown150(%alloc_121) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %75 = call @Unknown154(%alloc_111) : (memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> + %76 = call @Unknown155(%alloc_106) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> + %77 = call @Unknown156(%alloc_116) : (memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> + %78 = call @Unknown155(%alloc_101) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> + %79 = call @Unknown155(%alloc_96) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> + %80 = call @Unknown159(%alloc_86) : (memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> + %81 = call @Unknown160(%alloc_81) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> + %82 = call @Unknown161(%alloc_91) : (memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> + %83 = call @Unknown160(%alloc_76) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> + %84 = call @Unknown160(%alloc_71) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> + %85 = call @Unknown164(%alloc_61) : (memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> + %86 = call @Unknown165(%alloc_56) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> + %87 = call @Unknown166(%alloc_66) : (memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> + %88 = call @Unknown165(%alloc_51) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> + %89 = call @Unknown165(%alloc_46) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> + %alloc_142 = memref.alloc() : memref<1000x512xf16> + byre.compute @MatmulOp_f16f16_f16(%43, %49#1, %alloc_142) {lhs_contracting_dimension = 0 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_transpose, rhs_contracting_dimension = 0 : i64} : memref<4x512xf16>, memref<4x1000xf16>, memref<1000x512xf16> + %90 = call @Unknown170(%alloc_142) : (memref<1000x512xf16>) -> memref<1000x512xf32> + %91 = call @Unknown171(%49#1) : (memref<4x1000xf16>) -> memref<1000xf32> + %92 = call @Unknown172(%91) : (memref<1000xf32>) -> memref<1000xf32> + return %69, %70, %alloc_139, %alloc_140, %71, %alloc_133, %alloc_134, %72, %alloc_128, %alloc_129, %73, %alloc_123, %alloc_124, %74, %alloc_118, %alloc_119, %75, %alloc_108, %alloc_109, %76, %alloc_103, %alloc_104, %77, %alloc_113, %alloc_114, %78, %alloc_98, %alloc_99, %79, %alloc_93, %alloc_94, %80, %alloc_83, %alloc_84, %81, %alloc_78, %alloc_79, %82, %alloc_88, %alloc_89, %83, %alloc_73, %alloc_74, %84, %alloc_68, %alloc_69, %85, %alloc_58, %alloc_59, %86, %alloc_53, %alloc_54, %87, %alloc_63, %alloc_64, %88, %alloc_48, %alloc_49, %89, %alloc_43, %alloc_44, %90, %92 : memref, memref<64x3x7x7xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<128x64x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, 
memref<128x64x1x1xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<256x128x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x128x1x1xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<512x256x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x256x1x1xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<1000x512xf32>, memref<1000xf32> } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/Whole/7_set_space_opt.mlir b/compiler/test/E2E/ResNet18/Whole/7_set_space_opt.mlir index a4de5d088..c8fa12017 100644 --- a/compiler/test/E2E/ResNet18/Whole/7_set_space_opt.mlir +++ b/compiler/test/E2E/ResNet18/Whole/7_set_space_opt.mlir @@ -1,4673 +1,2788 @@ -// RUN: byteir-opt %s -remove-func-body="anchor-attr=__byteir_elementwise_fusion__" -set-op-space="entry-func=main space=cuda" -set-arg-space="entry-func=main all-space=cuda" | FileCheck %s +// RUN: byteir-opt %s -remove-func-body="anchor-attr=__byteir_elementwise_fusion__" -inline -gpu-launch-func-to-byre -set-op-space="entry-func=main space=cuda" -set-arg-space="entry-func=main all-space=cuda" | FileCheck %s // CHECK-LABEL: func.func @main module @IrToMhlo.2452 attributes {gpu.container_module} { gpu.module @unified { - gpu.func @Unknown164(%arg0: memref<1000xf32>, %arg1: memref<1000xf32>) kernel { + gpu.func @Unknown172(%arg0: memref<1000xf32>, %arg1: memref<1000xf32>) kernel { %c1000 = arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1000 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<1000xf32> - %7 = arith.truncf %6 : f32 to f16 - %8 = arith.extf %7 : f16 to f32 - memref.store %8, %arg1[%4] : memref<1000xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1000 step %6 { + %7 = memref.load %arg0[%arg2] : memref<1000xf32> + %8 = arith.truncf %7 : f32 to f16 + %9 = arith.extf %8 : f16 to f32 + memref.store %9, %arg1[%arg2] : memref<1000xf32> } gpu.return } - gpu.func @Unknown163(%arg0: memref<1000x512xf16>, %arg1: memref<1000x512xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown170(%arg0: memref<1000x512xf16>, %arg1: memref<1000x512xf32>) kernel { %c512000 = arith.constant 512000 : index %c512 = arith.constant 512 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512000 : index - scf.if %5 { - %6 = arith.remsi %4, %c512 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c512 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c512 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9] : memref<1000x512xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9] : memref<1000x512xf32> + %5 = 
gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c512000 step %6 { + %7 = arith.remsi %arg2, %c512 : index + %8 = arith.divsi %arg2, %c512 : index + %9 = memref.load %arg0[%8, %7] : memref<1000x512xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7] : memref<1000x512xf32> } gpu.return } - gpu.func @Unknown161(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { + gpu.func @Unknown166(%arg0: memref<512x256x1x1xf16>, %arg1: memref<512x256x1x1xf32>) kernel { + %c131072 = arith.constant 131072 : index %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index + %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c131072 step %6 { + %7 = arith.remsi %arg2, %c256 : index + %8 = arith.divsi %arg2, %c256 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<512x256x1x1xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<512x256x1x1xf32> } gpu.return } - gpu.func @Unknown160(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown165(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c512 = arith.constant 512 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = 
arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c2359296 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x512x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x512x3x3xf32> } gpu.return } - gpu.func @Unknown159(%arg0: memref<512x256x1x1xf16>, %arg1: memref<512x256x1x1xf32>) kernel { - %c0 = arith.constant 0 : index - %c131072 = arith.constant 131072 : index + gpu.func @Unknown164(%arg0: memref<512x256x3x3xf16>, %arg1: memref<512x256x3x3xf32>) kernel { + %c1179648 = arith.constant 1179648 : index %c256 = arith.constant 256 : index - %c-1 = arith.constant -1 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c131072 : index - scf.if %5 { - %6 = arith.remsi %4, %c256 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c256 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c256 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<512x256x1x1xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<512x256x1x1xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1179648 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x256x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x256x3x3xf32> } gpu.return } - gpu.func @Unknown158(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { + gpu.func @Unknown161(%arg0: memref<256x128x1x1xf16>, %arg1: memref<256x128x1x1xf32>) kernel { + %c32768 = arith.constant 32768 : index %c0 = arith.constant 0 : index - %c2359296 = 
arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index + %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c32768 step %6 { + %7 = arith.remsi %arg2, %c128 : index + %8 = arith.divsi %arg2, %c128 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<256x128x1x1xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<256x128x1x1xf32> } gpu.return } - gpu.func @Unknown157(%arg0: memref<512x256x3x3xf16>, %arg1: memref<512x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c1179648 = arith.constant 1179648 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown160(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { + %c589824 = arith.constant 589824 : index %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1179648 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, 
%c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x256x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c589824 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x256x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x256x3x3xf32> } gpu.return } - gpu.func @Unknown156(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index + gpu.func @Unknown159(%arg0: memref<256x128x3x3xf16>, %arg1: memref<256x128x3x3xf32>) kernel { + %c294912 = arith.constant 294912 : index + %c128 = arith.constant 128 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c294912 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x128x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x128x3x3xf32> } gpu.return } - gpu.func 
@Unknown155(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { + gpu.func @Unknown156(%arg0: memref<128x64x1x1xf16>, %arg1: memref<128x64x1x1xf32>) kernel { + %c8192 = arith.constant 8192 : index %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index + %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c8192 step %6 { + %7 = arith.remsi %arg2, %c64 : index + %8 = arith.divsi %arg2, %c64 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<128x64x1x1xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<128x64x1x1xf32> } gpu.return } - gpu.func @Unknown154(%arg0: memref<256x128x1x1xf16>, %arg1: memref<256x128x1x1xf32>) kernel { - %c0 = arith.constant 0 : index - %c32768 = arith.constant 32768 : index + gpu.func @Unknown155(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { + %c147456 = arith.constant 147456 : index %c128 = arith.constant 128 : index - %c-1 = arith.constant -1 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c32768 : index - scf.if %5 { - %6 = arith.remsi %4, %c128 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c128 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c128 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<256x128x1x1xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<256x128x1x1xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for 
%arg2 = %4 to %c147456 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x128x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x128x3x3xf32> } gpu.return } - gpu.func @Unknown153(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index + gpu.func @Unknown154(%arg0: memref<128x64x3x3xf16>, %arg1: memref<128x64x3x3xf32>) kernel { + %c73728 = arith.constant 73728 : index + %c64 = arith.constant 64 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c73728 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x64x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x64x3x3xf32> } gpu.return } - gpu.func @Unknown152(%arg0: memref<256x128x3x3xf16>, %arg1: memref<256x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c294912 = arith.constant 294912 : index + gpu.func @Unknown150(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { + %c36864 = arith.constant 36864 : index + %c64 = arith.constant 64 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c294912 : index - 
scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x128x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c36864 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x64x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x64x3x3xf32> } gpu.return } - gpu.func @Unknown151(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index + gpu.func @Unknown149(%arg0: memref<64x3x7x7xf16>, %arg1: memref<64x3x7x7xf32>) kernel { + %c9408 = arith.constant 9408 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - 
%33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c9408 step %6 { + %7 = arith.remsi %arg2, %c7 : index + %8 = arith.divsi %arg2, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c3 : index + %12 = arith.divsi %10, %c3 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x3x7x7xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x3x7x7xf32> } gpu.return } - gpu.func @Unknown150(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + gpu.func @Unknown148(%arg0: memref, %arg1: memref) kernel { + %c1 = arith.constant 1 : index + %cst = arith.constant 4.000000e+00 : f32 %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1 step %6 { + %7 = memref.load %arg0[] : memref + %8 = arith.negf %7 : f32 + %9 = arith.divf %8, %cst : f32 + memref.store %9, %arg1[] : memref } gpu.return } - gpu.func @Unknown149(%arg0: memref<128x64x1x1xf16>, %arg1: memref<128x64x1x1xf32>) kernel { - %c0 = arith.constant 0 : index - %c8192 = arith.constant 8192 : index + gpu.func @Unknown144(%arg0: memref<4x64x112x112xi1>, %arg1: memref<4x64x112x112xf16>, %arg2: memref<4x64x112x112xf16>) kernel { + %c3211264 = arith.constant 3211264 : index + %cst = arith.constant 0.000000e+00 : f16 %c64 = arith.constant 64 : index - %c-1 = arith.constant -1 : index + %c112 = arith.constant 112 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = 
arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c8192 : index - scf.if %5 { - %6 = arith.remsi %4, %c64 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c64 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c64 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<128x64x1x1xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<128x64x1x1xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c3211264 step %6 { + %7 = arith.remsi %arg3, %c112 : index + %8 = arith.divsi %arg3, %c112 : index + %9 = arith.remsi %8, %c112 : index + %10 = arith.divsi %8, %c112 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x112x112xi1> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x64x112x112xf16> + %15 = arith.select %13, %14, %cst : f16 + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x64x112x112xf16> } gpu.return } - gpu.func @Unknown148(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + gpu.func @Unknown143(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>) kernel { + %c802816 = arith.constant 802816 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg3, %c56 : index + %8 = arith.divsi %arg3, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = 
arith.divsi %8, %c56 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %15 = arith.addf %13, %14 : f16 + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x64x56x56xf16> } gpu.return } - gpu.func @Unknown147(%arg0: memref<128x64x3x3xf16>, %arg1: memref<128x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c73728 = arith.constant 73728 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown131(%arg0: memref<4x64x56x56xi1>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>) kernel { + %c802816 = arith.constant 802816 : index + %cst = arith.constant 0.000000e+00 : f16 %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c73728 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg3, %c56 : index + %8 = arith.divsi %arg3, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x56x56xi1> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %15 = arith.select %13, %14, %cst : f16 + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x64x56x56xf16> } gpu.return } - gpu.func @Unknown146(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown127(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>, %arg3: memref<4x64x56x56xf16>) kernel { + %c802816 = arith.constant 802816 : index + %cst = arith.constant 0.000000e+00 : f16 %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %0 = 
gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg4, %c56 : index + %8 = arith.divsi %arg4, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %15 = memref.load %arg2[%12, %11, %9, %7] : memref<4x64x56x56xi1> + %16 = arith.addf %13, %14 : f16 + %17 = arith.select %15, %16, %cst : f16 + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x64x56x56xf16> } gpu.return } - gpu.func @Unknown145(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown112(%arg0: memref<4x128x28x28xi1>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xf16>) kernel { + %c401408 = arith.constant 401408 : index + %cst = arith.constant 0.000000e+00 : f16 + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi 
%c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c401408 step %6 { + %7 = arith.remsi %arg3, %c28 : index + %8 = arith.divsi %arg3, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x128x28x28xi1> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x128x28x28xf16> + %15 = arith.select %13, %14, %cst : f16 + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x128x28x28xf16> } gpu.return } - gpu.func @Unknown144(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown108(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>, %arg3: memref<4x128x28x28xf16>) kernel { + %c401408 = arith.constant 401408 : index + %cst = arith.constant 0.000000e+00 : f16 + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c401408 step %6 
{ + %7 = arith.remsi %arg4, %c28 : index + %8 = arith.divsi %arg4, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x128x28x28xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x128x28x28xf16> + %15 = memref.load %arg2[%12, %11, %9, %7] : memref<4x128x28x28xi1> + %16 = arith.addf %13, %14 : f16 + %17 = arith.select %15, %16, %cst : f16 + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x128x28x28xf16> } gpu.return } - gpu.func @Unknown143(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown93(%arg0: memref<4x256x14x14xi1>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xf16>) kernel { + %c200704 = arith.constant 200704 : index + %cst = arith.constant 0.000000e+00 : f16 + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg3, %c14 : index + %8 = arith.divsi %arg3, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x256x14x14xi1> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x256x14x14xf16> + %15 = arith.select %13, %14, %cst : f16 + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x256x14x14xf16> } gpu.return } - gpu.func @Unknown142(%arg0: memref<64x3x7x7xf16>, %arg1: memref<64x3x7x7xf32>) kernel { - %c0 = arith.constant 0 : index - %c9408 = arith.constant 9408 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c3 = arith.constant 3 : index + gpu.func 
@Unknown89(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>, %arg3: memref<4x256x14x14xf16>) kernel { + %c200704 = arith.constant 200704 : index + %cst = arith.constant 0.000000e+00 : f16 + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c9408 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c3 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c3 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c3 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x3x7x7xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x3x7x7xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg4, %c14 : index + %8 = arith.divsi %arg4, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x256x14x14xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x256x14x14xf16> + %15 = memref.load %arg2[%12, %11, %9, %7] : memref<4x256x14x14xi1> + %16 = arith.addf %13, %14 : f16 + %17 = arith.select %15, %16, %cst : f16 + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x256x14x14xf16> } gpu.return } - gpu.func @Unknown141(%arg0: memref, %arg1: memref) kernel { - %cst = arith.constant 4.000000e+00 : f32 - %c1 = arith.constant 1 : index + gpu.func @Unknown78(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>, %arg3: memref<4x512x7x7xf16>) kernel { + %c100352 = arith.constant 100352 : index + %cst = arith.constant 0.000000e+00 : f16 + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1 : index - scf.if %5 { - %6 = memref.load %arg0[] : memref - %7 = arith.negf %6 : f32 - %8 = arith.divf %7, %cst : f32 - memref.store %8, %arg1[] : memref + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg4, %c7 : index + %8 = arith.divsi %arg4, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c512 
: index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x512x7x7xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x512x7x7xf16> + %15 = memref.load %arg2[%12, %11, %9, %7] : memref<4x512x7x7xi1> + %16 = arith.addf %13, %14 : f16 + %17 = arith.select %15, %16, %cst : f16 + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x512x7x7xf16> } gpu.return } - gpu.func @Unknown138(%arg0: memref<4x64x112x112xi1>, %arg1: memref<4x64x112x112xf16>, %arg2: memref<4x64x112x112xf16>) kernel { + gpu.func @Unknown74(%arg0: memref<4x512x7x7xi1>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xf16>) kernel { + %c100352 = arith.constant 100352 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c3211264 = arith.constant 3211264 : index - %c112 = arith.constant 112 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c3211264 : index - scf.if %5 { - %6 = arith.remsi %4, %c112 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c112 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c112 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c112 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c112 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c112 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x112x112xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x112x112xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x112x112xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg3, %c7 : index + %8 = arith.divsi %arg3, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x512x7x7xi1> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x512x7x7xf16> + %15 = arith.select %13, %14, %cst : f16 + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x512x7x7xf16> } gpu.return } - gpu.func @Unknown137(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>) kernel { - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown70(%arg0: 
memref<4x512xf16>, %arg1: memref<4x512x7x7xi1>, %arg2: memref<4x512x7x7xf16>) kernel { + %c100352 = arith.constant 100352 : index + %cst = arith.constant 4.900000e+01 : f16 + %cst_0 = arith.constant 0.000000e+00 : f16 + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = arith.addf %36, %37 : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg3, %c7 : index + %8 = arith.divsi %arg3, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11] : memref<4x512xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x512x7x7xi1> + %15 = arith.divf %13, %cst : f16 + %16 = arith.select %14, %15, %cst_0 : f16 + memref.store %16, %arg2[%12, %11, %9, %7] : memref<4x512x7x7xf16> } gpu.return } - gpu.func @Unknown133(%arg0: memref<4x64x56x56xi1>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown69(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4xf16>, %arg3: memref<4x1000xf16>, %arg4: memref<4x1000xf16>, %arg5: memref<4x1000xf16>) kernel { + %c4000 = arith.constant 4000 : index + %c1000 = arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : 
index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg6 = %4 to %c4000 step %6 { + %7 = arith.remsi %arg6, %c1000 : index + %8 = arith.divsi %arg6, %c1000 : index + %9 = memref.load %arg2[%8] : memref<4xf16> + %10 = memref.load %arg0[%8] : memref<4xf16> + %11 = memref.load %arg1[%8, %7] : memref<4x1000xf16> + %12 = memref.load %arg3[%8, %7] : memref<4x1000xf16> + %13 = arith.subf %11, %10 : f16 + %14 = math.exp %13 : f16 + %15 = arith.mulf %14, %9 : f16 + %16 = arith.subf %12, %15 : f16 + memref.store %13, %arg4[%8, %7] : memref<4x1000xf16> + memref.store %16, %arg5[%8, %7] : memref<4x1000xf16> } gpu.return } - gpu.func @Unknown129(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>, %arg3: memref<4x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown68(%arg0: memref<4xf16>, %arg1: memref<4xf16>) kernel { + %c4 = arith.constant 4 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi 
%32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x64x56x56xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c4 step %6 { + %7 = memref.load %arg0[%arg2] : memref<4xf16> + %8 = math.log %7 : f16 + memref.store %8, %arg1[%arg2] : memref<4xf16> } gpu.return } - gpu.func @Unknown125(%arg0: memref<4x64x56x56xi1>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown66(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4x1000xf16>) kernel { + %c4000 = arith.constant 4000 : index + %c1000 = arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c4000 step %6 { + %7 = arith.remsi %arg3, %c1000 : index + %8 = arith.divsi %arg3, %c1000 : index + %9 = memref.load %arg0[%8] : memref<4xf16> + %10 = memref.load %arg1[%8, %7] : memref<4x1000xf16> + %11 = arith.subf %10, %9 : f16 + memref.store %11, %arg2[%8, %7] : memref<4x1000xf16> } gpu.return } - gpu.func @Unknown121(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>, %arg3: memref<4x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index 
- %c64 = arith.constant 64 : index + gpu.func @Unknown64(%arg0: memref<1000xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4x1000xf16>) kernel { + %c4000 = arith.constant 4000 : index + %c1000 = arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x64x56x56xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c4000 step %6 { + %7 = arith.remsi %arg3, %c1000 : index + %8 = arith.divsi %arg3, %c1000 : index + %9 = memref.load %arg0[%7] : memref<1000xf16> + %10 = memref.load %arg1[%8, %7] : memref<4x1000xf16> + %11 = arith.addf %10, %9 : f16 + memref.store %11, %arg2[%8, %7] : memref<4x1000xf16> } gpu.return } - gpu.func @Unknown114(%arg0: memref<4x128x28x28xi1>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + gpu.func @Unknown63(%arg0: memref<4x512xf16>, %arg1: memref<4x512xf16>) kernel { + %c2048 = arith.constant 2048 : index + %cst = arith.constant 2.040100e-02 : f16 + %c512 = arith.constant 512 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 
: index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c2048 step %6 { + %7 = arith.remsi %arg2, %c512 : index + %8 = arith.divsi %arg2, %c512 : index + %9 = memref.load %arg0[%8, %7] : memref<4x512xf16> + %10 = arith.mulf %9, %cst : f16 + memref.store %10, %arg1[%8, %7] : memref<4x512xf16> } gpu.return } - gpu.func @Unknown110(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>, %arg3: memref<4x128x28x28xf16>) kernel { + gpu.func @Unknown57(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xf16>, %arg3: memref<4x512x7x7xi1>) kernel { + %c100352 = arith.constant 100352 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x128x28x28xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %39 = arith.addf %37, %38 : f16 - %40 = 
arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg4, %c7 : index + %8 = arith.divsi %arg4, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x512x7x7xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x512x7x7xf16> + %15 = arith.addf %13, %14 : f16 + %16 = arith.maximumf %15, %cst : f16 + %17 = arith.cmpf ogt, %16, %cst : f16 + memref.store %16, %arg2[%12, %11, %9, %7] : memref<4x512x7x7xf16> + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x512x7x7xi1> } gpu.return } - gpu.func @Unknown106(%arg0: memref<4x128x28x28xi1>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xf16>) kernel { + gpu.func @Unknown55(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>) kernel { + %c100352 = arith.constant 100352 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg3, %c7 : index + %8 = arith.divsi %arg3, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x512x7x7xf16> + %14 = arith.maximumf %13, %cst : f16 + %15 = arith.cmpf ogt, %14, %cst : f16 + memref.store %14, %arg1[%12, %11, %9, %7] : 
memref<4x512x7x7xf16> + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x512x7x7xi1> } gpu.return } - gpu.func @Unknown102(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>, %arg3: memref<4x128x28x28xf16>) kernel { + gpu.func @Unknown48(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xf16>, %arg3: memref<4x256x14x14xi1>) kernel { + %c200704 = arith.constant 200704 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x128x28x28xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg4, %c14 : index + %8 = arith.divsi %arg4, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x256x14x14xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x256x14x14xf16> + %15 = arith.addf %13, %14 : f16 + %16 = arith.maximumf %15, %cst : f16 + %17 = arith.cmpf ogt, %16, %cst : f16 + memref.store %16, %arg2[%12, %11, %9, %7] : memref<4x256x14x14xf16> + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x256x14x14xi1> } gpu.return } - gpu.func @Unknown95(%arg0: memref<4x256x14x14xi1>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index + gpu.func @Unknown46(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>) kernel { 
%c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index + %cst = arith.constant 0.000000e+00 : f16 %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg3, %c14 : index + %8 = arith.divsi %arg3, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x256x14x14xf16> + %14 = arith.maximumf %13, %cst : f16 + %15 = arith.cmpf ogt, %14, %cst : f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<4x256x14x14xf16> + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x256x14x14xi1> } gpu.return } - gpu.func @Unknown91(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>, %arg3: memref<4x256x14x14xf16>) kernel { + gpu.func @Unknown39(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xf16>, %arg3: memref<4x128x28x28xi1>) kernel { + %c401408 = arith.constant 401408 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = 
arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x256x14x14xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c401408 step %6 { + %7 = arith.remsi %arg4, %c28 : index + %8 = arith.divsi %arg4, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x128x28x28xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x128x28x28xf16> + %15 = arith.addf %13, %14 : f16 + %16 = arith.maximumf %15, %cst : f16 + %17 = arith.cmpf ogt, %16, %cst : f16 + memref.store %16, %arg2[%12, %11, %9, %7] : memref<4x128x28x28xf16> + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x128x28x28xi1> } gpu.return } - gpu.func @Unknown87(%arg0: memref<4x256x14x14xi1>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xf16>) kernel { + gpu.func @Unknown37(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>) kernel { + %c401408 = arith.constant 401408 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : 
index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c401408 step %6 { + %7 = arith.remsi %arg3, %c28 : index + %8 = arith.divsi %arg3, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x128x28x28xf16> + %14 = arith.maximumf %13, %cst : f16 + %15 = arith.cmpf ogt, %14, %cst : f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<4x128x28x28xf16> + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x128x28x28xi1> } gpu.return } - gpu.func @Unknown83(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>, %arg3: memref<4x256x14x14xf16>) kernel { + gpu.func @Unknown30(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>, %arg3: memref<4x64x56x56xi1>) kernel { + %c802816 = arith.constant 802816 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x256x14x14xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : 
memref<4x256x14x14xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg4, %c56 : index + %8 = arith.divsi %arg4, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %15 = arith.addf %13, %14 : f16 + %16 = arith.maximumf %15, %cst : f16 + %17 = arith.cmpf ogt, %16, %cst : f16 + memref.store %16, %arg2[%12, %11, %9, %7] : memref<4x64x56x56xf16> + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x64x56x56xi1> } gpu.return } - gpu.func @Unknown76(%arg0: memref<4x512x7x7xi1>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xf16>) kernel { + gpu.func @Unknown28(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>) kernel { + %c802816 = arith.constant 802816 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xf16> - } - gpu.return - } - gpu.func @Unknown72(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>, %arg3: memref<4x512x7x7xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index 
- %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x512x7x7xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x512x7x7xf16> - } - gpu.return - } - gpu.func @Unknown68(%arg0: memref<4x512x7x7xi1>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %38 = arith.select %36, %37, %cst 
: f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg3, %c56 : index + %8 = arith.divsi %arg3, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %14 = arith.maximumf %13, %cst : f16 + %15 = arith.cmpf ogt, %14, %cst : f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<4x64x56x56xf16> + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x64x56x56xi1> } gpu.return } - gpu.func @Unknown64(%arg0: memref<4x512xf16>, %arg1: memref<4x512x7x7xi1>, %arg2: memref<4x512x7x7xf16>) kernel { + gpu.func @Unknown26(%arg0: memref<4x64x112x112xf16>, %arg1: memref<4x64x112x112xf16>, %arg2: memref<4x64x112x112xi1>) kernel { + %c3211264 = arith.constant 3211264 : index %cst = arith.constant 0.000000e+00 : f16 - %cst_0 = arith.constant 4.900000e+01 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index + %c64 = arith.constant 64 : index + %c112 = arith.constant 112 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg1[%35, %29, %19, %9] : memref<4x512x7x7xi1> - %37 = memref.load %arg0[%35, %29] : memref<4x512xf16> - %38 = arith.divf %37, %cst_0 : f16 - %39 = arith.select %36, %38, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c3211264 step %6 { + %7 = arith.remsi %arg3, %c112 : index + %8 = arith.divsi %arg3, %c112 : index + %9 = arith.remsi %8, %c112 : index + %10 = arith.divsi %8, %c112 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x112x112xf16> + %14 = arith.maximumf %13, %cst : f16 + %15 = arith.cmpf ogt, %14, %cst : f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<4x64x112x112xf16> + memref.store %15, %arg2[%12, %11, %9, 
%7] : memref<4x64x112x112xi1> } gpu.return } - gpu.func @Unknown63(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4xf16>, %arg3: memref<4x1000xf16>, %arg4: memref<4x1000xf32>, %arg5: memref<4x1000xf16>, %arg6: memref<4x1000xf32>, %arg7: memref<4x1000xf32>) kernel { - %c0 = arith.constant 0 : index - %c4000 = arith.constant 4000 : index + gpu.func @Unknown24(%arg0: memref<1000xf32>, %arg1: memref<1000xf16>) kernel { %c1000 = arith.constant 1000 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c4000 : index - scf.if %5 { - %6 = arith.remsi %4, %c1000 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c1000 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c1000 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg3[%15, %9] : memref<4x1000xf16> - %17 = memref.load %arg1[%15, %9] : memref<4x1000xf16> - %18 = memref.load %arg0[%15] : memref<4xf16> - %19 = memref.load %arg2[%15] : memref<4xf16> - %20 = memref.load %arg4[%15, %9] : memref<4x1000xf32> - %21 = math.log %18 : f16 - %22 = arith.subf %17, %21 : f16 - %23 = math.exp %22 : f16 - %24 = arith.mulf %23, %19 : f16 - %25 = arith.subf %16, %24 : f16 - %26 = arith.extf %22 : f16 to f32 - %27 = arith.mulf %26, %20 : f32 - %28 = arith.extf %25 : f16 to f32 - memref.store %25, %arg5[%15, %9] : memref<4x1000xf16> - memref.store %27, %arg6[%15, %9] : memref<4x1000xf32> - memref.store %28, %arg7[%15, %9] : memref<4x1000xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1000 step %6 { + %7 = memref.load %arg0[%arg2] : memref<1000xf32> + %8 = arith.truncf %7 : f32 to f16 + memref.store %8, %arg1[%arg2] : memref<1000xf16> } gpu.return } - gpu.func @Unknown62(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4x1000xf16>, %arg3: memref<4x1000xf16>) kernel { - %c0 = arith.constant 0 : index - %c4000 = arith.constant 4000 : index - %c1000 = arith.constant 1000 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown23(%arg0: memref<1000x512xf32>, %arg1: memref<1000x512xf16>) kernel { + %c512000 = arith.constant 512000 : index + %c512 = arith.constant 512 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c4000 : index - scf.if %5 { - %6 = arith.remsi %4, %c1000 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c1000 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c1000 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg1[%15, %9] : memref<4x1000xf16> - %17 = memref.load %arg0[%15] : memref<4xf16> - %18 = arith.subf %16, %17 : f16 - %19 = math.exp %18 : f16 - memref.store %18, %arg2[%15, %9] : memref<4x1000xf16> - memref.store %19, %arg3[%15, %9] : memref<4x1000xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c512000 step %6 { + %7 = arith.remsi %arg2, %c512 : index + %8 = arith.divsi %arg2, %c512 : index + %9 = memref.load %arg0[%8, %7] : memref<1000x512xf32> + %10 
= arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7] : memref<1000x512xf16> } gpu.return } - gpu.func @Unknown61(%arg0: memref<1000xf32>, %arg1: memref<4x1000xf16>, %arg2: memref<4x1000xf16>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown22(%arg0: memref<4x1000xf32>, %arg1: memref<4x1000xf16>) kernel { %c4000 = arith.constant 4000 : index + %cst = arith.constant -2.500000e-01 : f32 %c1000 = arith.constant 1000 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c4000 : index - scf.if %5 { - %6 = arith.remsi %4, %c1000 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c1000 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c1000 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg1[%15, %9] : memref<4x1000xf16> - %17 = memref.load %arg0[%9] : memref<1000xf32> - %18 = arith.truncf %17 : f32 to f16 - %19 = arith.addf %16, %18 : f16 - memref.store %19, %arg2[%15, %9] : memref<4x1000xf16> - } - gpu.return - } - gpu.func @Unknown60(%arg0: memref<4x512xf16>, %arg1: memref<4x512xf16>) kernel { - %cst = arith.constant 2.040100e-02 : f16 - %c0 = arith.constant 0 : index - %c2048 = arith.constant 2048 : index - %c512 = arith.constant 512 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2048 : index - scf.if %5 { - %6 = arith.remsi %4, %c512 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c512 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c512 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9] : memref<4x512xf16> - %17 = arith.mulf %16, %cst : f16 - memref.store %17, %arg1[%15, %9] : memref<4x512xf16> - } - gpu.return - } - gpu.func @Unknown59(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xf16>, %arg3: memref<4x512x7x7xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : 
index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x512x7x7xi1> - } - gpu.return - } - gpu.func @Unknown57(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xi1> - } - gpu.return - } - gpu.func @Unknown55(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xf16>, %arg3: memref<4x512x7x7xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = 
arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x512x7x7xi1> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c4000 step %6 { + %7 = arith.remsi %arg2, %c1000 : index + %8 = arith.divsi %arg2, %c1000 : index + %9 = memref.load %arg0[%8, %7] : memref<4x1000xf32> + %10 = arith.mulf %9, %cst : f32 + %11 = arith.truncf %10 : f32 to f16 + memref.store %11, %arg1[%8, %7] : memref<4x1000xf16> } gpu.return } - gpu.func @Unknown53(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown19(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { + %c2359296 = arith.constant 2359296 : index %c512 = arith.constant 512 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi 
slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xi1> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c2359296 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x512x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x512x3x3xf16> } gpu.return } - gpu.func @Unknown50(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xf16>, %arg3: memref<4x256x14x14xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown18(%arg0: memref<512x256x3x3xf32>, %arg1: memref<512x256x3x3xf16>) kernel { + %c1179648 = arith.constant 1179648 : index %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x256x14x14xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x256x14x14xi1> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1179648 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 
= arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x256x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x256x3x3xf16> } gpu.return } - gpu.func @Unknown48(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 + gpu.func @Unknown17(%arg0: memref<512x256x1x1xf32>, %arg1: memref<512x256x1x1xf16>) kernel { + %c131072 = arith.constant 131072 : index %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x256x14x14xi1> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c131072 step %6 { + %7 = arith.remsi %arg2, %c256 : index + %8 = arith.divsi %arg2, %c256 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<512x256x1x1xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<512x256x1x1xf16> } gpu.return } - gpu.func @Unknown46(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xf16>, %arg3: memref<4x256x14x14xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown14(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { + %c589824 = arith.constant 589824 : index %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = 
arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x256x14x14xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x256x14x14xi1> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c589824 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x256x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x256x3x3xf16> } gpu.return } - gpu.func @Unknown44(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index + gpu.func @Unknown13(%arg0: memref<256x128x3x3xf32>, %arg1: memref<256x128x3x3xf16>) kernel { + %c294912 = arith.constant 294912 : index + %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi 
%c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x256x14x14xi1> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c294912 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x128x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x128x3x3xf16> } gpu.return } - gpu.func @Unknown41(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xf16>, %arg3: memref<4x128x28x28xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 + gpu.func @Unknown12(%arg0: memref<256x128x1x1xf32>, %arg1: memref<256x128x1x1xf16>) kernel { + %c32768 = arith.constant 32768 : index %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x128x28x28xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x128x28x28xi1> + %5 = 
gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c32768 step %6 { + %7 = arith.remsi %arg2, %c128 : index + %8 = arith.divsi %arg2, %c128 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<256x128x1x1xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<256x128x1x1xf16> } gpu.return } - gpu.func @Unknown39(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown9(%arg0: memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { + %c147456 = arith.constant 147456 : index %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x128x28x28xi1> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c147456 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x128x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x128x3x3xf16> } gpu.return } - gpu.func @Unknown37(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xf16>, %arg3: memref<4x128x28x28xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + gpu.func @Unknown8(%arg0: memref<128x64x3x3xf32>, %arg1: memref<128x64x3x3xf16>) kernel { + %c73728 = 
arith.constant 73728 : index + %c64 = arith.constant 64 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x128x28x28xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x128x28x28xi1> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c73728 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x64x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x64x3x3xf16> } gpu.return } - gpu.func @Unknown35(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 + gpu.func @Unknown7(%arg0: memref<128x64x1x1xf32>, %arg1: memref<128x64x1x1xf16>) kernel { + %c8192 = arith.constant 8192 : index %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, 
%c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x128x28x28xi1> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c8192 step %6 { + %7 = arith.remsi %arg2, %c64 : index + %8 = arith.divsi %arg2, %c64 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<128x64x1x1xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<128x64x1x1xf16> } gpu.return } - gpu.func @Unknown32(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>, %arg3: memref<4x64x56x56xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown3(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { + %c36864 = arith.constant 36864 : index %c64 = arith.constant 64 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : 
memref<4x64x56x56xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x64x56x56xi1> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c36864 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x64x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x64x3x3xf16> } gpu.return } - gpu.func @Unknown30(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown1(%arg0: memref<64x3x7x7xf32>, %arg1: memref<64x3x7x7xf16>) kernel { + %c9408 = arith.constant 9408 : index + %c3 = arith.constant 3 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x56x56xi1> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c9408 step %6 { + %7 = arith.remsi %arg2, %c7 : index + %8 = arith.divsi %arg2, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c3 : index + %12 = arith.divsi %10, %c3 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x3x7x7xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x3x7x7xf16> } gpu.return } - gpu.func @Unknown28(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>, %arg3: memref<4x64x56x56xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 
802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown0(%arg0: memref<4x3x224x224xf32>, %arg1: memref<4x3x224x224xf16>) kernel { + %c602112 = arith.constant 602112 : index + %c3 = arith.constant 3 : index + %c224 = arith.constant 224 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x64x56x56xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x64x56x56xi1> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c602112 step %6 { + %7 = arith.remsi %arg2, %c224 : index + %8 = arith.divsi %arg2, %c224 : index + %9 = arith.remsi %8, %c224 : index + %10 = arith.divsi %8, %c224 : index + %11 = arith.remsi %10, %c3 : index + %12 = arith.divsi %10, %c3 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x3x224x224xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<4x3x224x224xf16> } gpu.return } - gpu.func @Unknown26(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 + gpu.func @Unknown25_kernel(%arg0: memref<4x1000xf16>, %arg1: memref<4xf16>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %0 = gpu.block_id x + %1 = gpu.block_id y + %2 = gpu.block_id z + %3 = gpu.thread_id x + %4 = gpu.thread_id y + %5 = gpu.thread_id z + %6 = gpu.grid_dim x + %7 = gpu.grid_dim y + %8 = gpu.grid_dim z + %9 = gpu.block_dim x + %10 = gpu.block_dim y + %11 = gpu.block_dim z + cf.br ^bb1 + ^bb1: // pred: ^bb0 + %c2 = arith.constant 2 : index %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index %c-1 = arith.constant -1 : index + %c512 = arith.constant 512 : index + %c-1024 = arith.constant -1024 : index + %c1000 = arith.constant 1000 : index + %cst = arith.constant 
0.000000e+00 : f16
+      %c1 = arith.constant 1 : index
+      %c256 = arith.constant 256 : index
+      %c128 = arith.constant 128 : index
       %c64 = arith.constant 64 : index
+      %c32 = arith.constant 32 : index
+      %c16 = arith.constant 16 : index
+      %c8 = arith.constant 8 : index
+      %c4 = arith.constant 4 : index
+      %12 = gpu.block_id x
+      %subview = memref.subview %arg0[%12, 0] [1, 1000] [1, 1] : memref<4x1000xf16> to memref<1000xf16, strided<[1], offset: ?>>
+      %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<1000xf16, strided<[1], offset: ?>> into memref<1x1000xf16, strided<[1000, 1], offset: ?>>
+      %alloca = memref.alloca() : memref<512xf16, #gpu.address_space<workgroup>>
+      %13 = gpu.thread_id x
+      %14 = arith.muli %13, %c2 : index
+      %15 = arith.cmpi slt, %13, %c0 : index
+      %16 = arith.subi %c-1, %13 : index
+      %17 = arith.select %15, %16, %13 : index
+      %18 = arith.divsi %17, %c512 : index
+      %19 = arith.subi %c-1, %18 : index
+      %20 = arith.select %15, %19, %18 : index
+      %21 = arith.muli %20, %c-1024 : index
+      %22 = arith.addi %14, %21 : index
+      %23 = arith.cmpi slt, %22, %c1000 : index
+      %24 = arith.select %23, %22, %c1000 : index
+      %25 = arith.addi %22, %c2 : index
+      %26 = arith.cmpi slt, %25, %c1000 : index
+      %27 = arith.select %26, %25, %c1000 : index
+      %28 = arith.subi %27, %24 : index
+      %subview_0 = memref.subview %expand_shape[0, %24] [1, %28] [1, 1] : memref<1x1000xf16, strided<[1000, 1], offset: ?>> to memref<?xf16, strided<[1], offset: ?>>
+      %expand_shape_1 = memref.expand_shape %subview_0 [[0, 1]] : memref<?xf16, strided<[1], offset: ?>> into memref<1x?xf16, strided<[?, 1], offset: ?>>
+      %29 = arith.cmpi ugt, %28, %c0 : index
+      %30 = scf.if %29 -> (f16) {
+        %44 = memref.load %expand_shape_1[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>>
+        scf.yield %44 : f16
+      } else {
+        scf.yield %cst : f16
+      }
+      %31 = arith.addf %30, %cst : f16
+      %32 = arith.cmpi ugt, %28, %c1 : index
+      %33 = scf.if %32 -> (f16) {
+        %44 = memref.load %expand_shape_1[%c0, %c1] : memref<1x?xf16, strided<[?, 1], offset: ?>>
+        scf.yield %44 : f16
+      } else {
+        scf.yield %cst : f16
+      }
+      %34 = arith.addf %31, %33 : f16
+      memref.store %34, %alloca[%13] : memref<512xf16, #gpu.address_space<workgroup>>
+      gpu.barrier
+      %alloca_2 = memref.alloca() : memref<256xf16, #gpu.address_space<workgroup>>
+      %35 = arith.cmpi ult, %13, %c256 : index
+      scf.if %35 {
+        %44 = memref.load %alloca[%14] : memref<512xf16, #gpu.address_space<workgroup>>
+        %45 = arith.addf %44, %cst : f16
+        %46 = arith.addi %14, %c1 : index
+        %47 = memref.load %alloca[%46] : memref<512xf16, #gpu.address_space<workgroup>>
+        %48 = arith.addf %47, %45 : f16
+        memref.store %48, %alloca_2[%13] : memref<256xf16, #gpu.address_space<workgroup>>
+      }
+      gpu.barrier
+      %alloca_3 = memref.alloca() : memref<128xf16, #gpu.address_space<workgroup>>
+      %36 = arith.cmpi ult, %13, %c128 : index
+      scf.if %36 {
+        %44 = memref.load %alloca_2[%14] : memref<256xf16, #gpu.address_space<workgroup>>
+        %45 = arith.addf %44, %cst : f16
+        %46 = arith.addi %14, %c1 : index
+        %47 = memref.load %alloca_2[%46] : memref<256xf16, #gpu.address_space<workgroup>>
+        %48 = arith.addf %47, %45 : f16
+        memref.store %48, %alloca_3[%13] : memref<128xf16, #gpu.address_space<workgroup>>
+      }
+      gpu.barrier
+      %alloca_4 = memref.alloca() : memref<64xf16, #gpu.address_space<workgroup>>
+      %37 = arith.cmpi ult, %13, %c64 : index
+      scf.if %37 {
+        %44 = memref.load %alloca_3[%14] : memref<128xf16, #gpu.address_space<workgroup>>
+        %45 = arith.addf %44, %cst : f16
+        %46 = arith.addi %14, %c1 : index
+        %47 = memref.load %alloca_3[%46] : memref<128xf16, #gpu.address_space<workgroup>>
+        %48 = arith.addf %47, %45 : f16
+        memref.store %48, %alloca_4[%13] : memref<64xf16, #gpu.address_space<workgroup>>
+      }
+      gpu.barrier
+      %alloca_5 = memref.alloca() : memref<32xf16, #gpu.address_space<workgroup>>
+      %38 = arith.cmpi ult, %13, %c32 : index
+      scf.if %38 {
+        %44 = memref.load %alloca_4[%14] : memref<64xf16, #gpu.address_space<workgroup>>
+        %45 = arith.addf %44, %cst : f16
+        %46 = arith.addi %14, %c1 : index
+        %47 = memref.load %alloca_4[%46] : memref<64xf16, #gpu.address_space<workgroup>>
+        %48 = arith.addf %47, %45 : f16
+        memref.store %48, %alloca_5[%13] : memref<32xf16, #gpu.address_space<workgroup>>
+      }
+      gpu.barrier
+      %alloca_6 = memref.alloca() : memref<16xf16, #gpu.address_space<workgroup>>
+      %39 = arith.cmpi ult, %13, %c16 : index
+      scf.if %39 {
+        %44 = memref.load %alloca_5[%14] : memref<32xf16, #gpu.address_space<workgroup>>
+        %45 = arith.addf %44, %cst : f16
+        %46 = arith.addi %14, %c1 : index
+        %47 = memref.load %alloca_5[%46] : memref<32xf16, #gpu.address_space<workgroup>>
+        %48 = arith.addf %47, %45 : f16
+        memref.store %48, %alloca_6[%13] : memref<16xf16, #gpu.address_space<workgroup>>
+      }
+      gpu.barrier
+      %alloca_7 = memref.alloca() : memref<8xf16, #gpu.address_space<workgroup>>
+      %40 = arith.cmpi ult, %13, %c8 : index
+      scf.if %40 {
+        %44 = memref.load %alloca_6[%14] : memref<16xf16, #gpu.address_space<workgroup>>
+        %45 = arith.addf %44, %cst : f16
+        %46 = arith.addi %14, %c1 : index
+        %47 = memref.load %alloca_6[%46] : memref<16xf16, #gpu.address_space<workgroup>>
+        %48 = arith.addf %47, %45 : f16
+        memref.store %48, %alloca_7[%13] : memref<8xf16, #gpu.address_space<workgroup>>
+      }
+      gpu.barrier
+      %alloca_8 = memref.alloca() : memref<4xf16, #gpu.address_space<workgroup>>
+      %41 = arith.cmpi ult, %13, %c4 : index
+      scf.if %41 {
+        %44 = memref.load %alloca_7[%14] : memref<8xf16, #gpu.address_space<workgroup>>
+        %45 = arith.addf %44, %cst : f16
+        %46 = arith.addi %14, %c1 : index
+        %47 = memref.load %alloca_7[%46] : memref<8xf16, #gpu.address_space<workgroup>>
+        %48 = arith.addf %47, %45 : f16
+ memref.store %48, %alloca_8[%13] : memref<4xf16, #gpu.address_space> + } + gpu.barrier + %alloca_9 = memref.alloca() : memref<2xf16, #gpu.address_space> + %42 = arith.cmpi ult, %13, %c2 : index + scf.if %42 { + %44 = memref.load %alloca_8[%14] : memref<4xf16, #gpu.address_space> + %45 = arith.addf %44, %cst : f16 + %46 = arith.addi %14, %c1 : index + %47 = memref.load %alloca_8[%46] : memref<4xf16, #gpu.address_space> + %48 = arith.addf %47, %45 : f16 + memref.store %48, %alloca_9[%13] : memref<2xf16, #gpu.address_space> + } + gpu.barrier + %43 = arith.cmpi ult, %13, %c1 : index + scf.if %43 { + %44 = memref.load %alloca_9[%14] : memref<2xf16, #gpu.address_space> + %45 = arith.addf %44, %cst : f16 + %46 = arith.addi %14, %c1 : index + %47 = memref.load %alloca_9[%46] : memref<2xf16, #gpu.address_space> + %48 = arith.addf %47, %45 : f16 + memref.store %48, %arg1[%12] : memref<4xf16> + } + gpu.barrier + gpu.return + } + gpu.func @Unknown62_kernel(%arg0: memref<2048x49xf16>, %arg1: memref<2048xf16>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %0 = gpu.block_id x + %1 = gpu.block_id y + %2 = gpu.block_id z + %3 = gpu.thread_id x + %4 = gpu.thread_id y + %5 = gpu.thread_id z + %6 = gpu.grid_dim x + %7 = gpu.grid_dim y + %8 = gpu.grid_dim z + %9 = gpu.block_dim x + %10 = gpu.block_dim y + %11 = gpu.block_dim z + cf.br ^bb1 + ^bb1: // pred: ^bb0 + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index + %c49 = arith.constant 49 : index + %c1 = arith.constant 1 : index %cst = arith.constant 0.000000e+00 : f16 + %c32 = arith.constant 32 : index + %c2 = arith.constant 2 : index + %c16 = arith.constant 16 : index + %c8 = arith.constant 8 : index + %c4 = arith.constant 4 : index + %12 = gpu.block_id x + %subview = memref.subview %arg0[%12, 0] [1, 49] [1, 1] : memref<2048x49xf16> to memref<49xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<49xf16, strided<[1], offset: ?>> into memref<1x49xf16, strided<[49, 1], offset: ?>> + %alloca = memref.alloca() : memref<64xf16, #gpu.address_space> + %13 = gpu.thread_id x + %14 = arith.remsi %13, %c64 : index + %15 = arith.cmpi slt, %14, %c0 : index + %16 = arith.addi %14, %c64 : index + %17 = arith.select %15, %16, %14 : index + %18 = arith.cmpi slt, %17, %c49 : index + %19 = arith.select %18, %17, %c49 : index + %20 = arith.addi %17, %c1 : index + %21 = arith.cmpi slt, %20, %c49 : index + %22 = arith.select %21, %20, %c49 : index + %23 = arith.subi %22, %19 : index + %subview_0 = memref.subview %expand_shape[0, %19] [1, %23] [1, 1] : memref<1x49xf16, strided<[49, 1], offset: ?>> to memref> + %expand_shape_1 = memref.expand_shape %subview_0 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %24 = arith.cmpi ugt, %23, %c0 : index + %25 = scf.if %24 -> (f16) { + %33 = memref.load %expand_shape_1[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %33 : f16 + } else { + scf.yield %cst : f16 + } + %26 = arith.addf %25, %cst : f16 + memref.store %26, %alloca[%13] : memref<64xf16, #gpu.address_space> + gpu.barrier + %alloca_2 = memref.alloca() : memref<32xf16, #gpu.address_space> + %27 = arith.cmpi ult, %13, %c32 : index + scf.if %27 { + %33 = arith.muli %13, %c2 : index + %34 = memref.load %alloca[%33] : memref<64xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %33, %c1 : index + %37 = memref.load %alloca[%36] : memref<64xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, 
%alloca_2[%13] : memref<32xf16, #gpu.address_space> + } + gpu.barrier + %alloca_3 = memref.alloca() : memref<16xf16, #gpu.address_space> + %28 = arith.cmpi ult, %13, %c16 : index + scf.if %28 { + %33 = arith.muli %13, %c2 : index + %34 = memref.load %alloca_2[%33] : memref<32xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %33, %c1 : index + %37 = memref.load %alloca_2[%36] : memref<32xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_3[%13] : memref<16xf16, #gpu.address_space> + } + gpu.barrier + %alloca_4 = memref.alloca() : memref<8xf16, #gpu.address_space> + %29 = arith.cmpi ult, %13, %c8 : index + scf.if %29 { + %33 = arith.muli %13, %c2 : index + %34 = memref.load %alloca_3[%33] : memref<16xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %33, %c1 : index + %37 = memref.load %alloca_3[%36] : memref<16xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_4[%13] : memref<8xf16, #gpu.address_space> + } + gpu.barrier + %alloca_5 = memref.alloca() : memref<4xf16, #gpu.address_space> + %30 = arith.cmpi ult, %13, %c4 : index + scf.if %30 { + %33 = arith.muli %13, %c2 : index + %34 = memref.load %alloca_4[%33] : memref<8xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %33, %c1 : index + %37 = memref.load %alloca_4[%36] : memref<8xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_5[%13] : memref<4xf16, #gpu.address_space> + } + gpu.barrier + %alloca_6 = memref.alloca() : memref<2xf16, #gpu.address_space> + %31 = arith.cmpi ult, %13, %c2 : index + scf.if %31 { + %33 = arith.muli %13, %c2 : index + %34 = memref.load %alloca_5[%33] : memref<4xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %33, %c1 : index + %37 = memref.load %alloca_5[%36] : memref<4xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_6[%13] : memref<2xf16, #gpu.address_space> + } + gpu.barrier + %32 = arith.cmpi ult, %13, %c1 : index + scf.if %32 { + %33 = arith.muli %13, %c2 : index + %34 = memref.load %alloca_6[%33] : memref<2xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %33, %c1 : index + %37 = memref.load %alloca_6[%36] : memref<2xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %arg1[%12] : memref<2048xf16> + } + gpu.barrier + gpu.return + } + gpu.func @Unknown65_kernel(%arg0: memref<4x1000xf16>, %arg1: memref<4xf16>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %0 = gpu.block_id x + %1 = gpu.block_id y + %2 = gpu.block_id z + %3 = gpu.thread_id x + %4 = gpu.thread_id y + %5 = gpu.thread_id z + %6 = gpu.grid_dim x + %7 = gpu.grid_dim y + %8 = gpu.grid_dim z + %9 = gpu.block_dim x + %10 = gpu.block_dim y + %11 = gpu.block_dim z + cf.br ^bb1 + ^bb1: // pred: ^bb0 + %c2 = arith.constant 2 : index %c0 = arith.constant 0 : index - %c3211264 = arith.constant 3211264 : index - %c112 = arith.constant 112 : index %c-1 = arith.constant -1 : index + %c512 = arith.constant 512 : index + %c-1024 = arith.constant -1024 : index + %c1000 = arith.constant 1000 : index + %cst = arith.constant 0.000000e+00 : f16 + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %c128 = arith.constant 128 : index %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index 
- %5 = arith.cmpi slt, %4, %c3211264 : index - scf.if %5 { - %6 = arith.remsi %4, %c112 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c112 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c112 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c112 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c112 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c112 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x112x112xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x64x112x112xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x112x112xi1> - } - gpu.return - } - gpu.func @Unknown23(%arg0: memref<1000x512xf32>, %arg1: memref<1000x512xf16>) kernel { + %c32 = arith.constant 32 : index + %c16 = arith.constant 16 : index + %c8 = arith.constant 8 : index + %c4 = arith.constant 4 : index + %12 = gpu.block_id x + %subview = memref.subview %arg0[%12, 0] [1, 1000] [1, 1] : memref<4x1000xf16> to memref<1000xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<1000xf16, strided<[1], offset: ?>> into memref<1x1000xf16, strided<[1000, 1], offset: ?>> + %alloca = memref.alloca() : memref<512xf16, #gpu.address_space> + %13 = gpu.thread_id x + %14 = arith.muli %13, %c2 : index + %15 = arith.cmpi slt, %13, %c0 : index + %16 = arith.subi %c-1, %13 : index + %17 = arith.select %15, %16, %13 : index + %18 = arith.divsi %17, %c512 : index + %19 = arith.subi %c-1, %18 : index + %20 = arith.select %15, %19, %18 : index + %21 = arith.muli %20, %c-1024 : index + %22 = arith.addi %14, %21 : index + %23 = arith.cmpi slt, %22, %c1000 : index + %24 = arith.select %23, %22, %c1000 : index + %25 = arith.addi %22, %c2 : index + %26 = arith.cmpi slt, %25, %c1000 : index + %27 = arith.select %26, %25, %c1000 : index + %28 = arith.subi %27, %24 : index + %subview_0 = memref.subview %expand_shape[0, %24] [1, %28] [1, 1] : memref<1x1000xf16, strided<[1000, 1], offset: ?>> to memref> + %expand_shape_1 = memref.expand_shape %subview_0 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %29 = arith.cmpi ugt, %28, %c0 : index + %30 = scf.if %29 -> (f16) { + %43 = memref.load %expand_shape_1[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %43 : f16 + } else { + scf.yield %cst : f16 + } + %31 = arith.cmpi ugt, %28, %c1 : index + %32 = scf.if %31 -> (f16) { + %43 = memref.load %expand_shape_1[%c0, %c1] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %43 : f16 + } else { + scf.yield %cst : f16 + } + %33 = arith.maximumf %30, %32 : f16 + memref.store %33, %alloca[%13] : 
memref<512xf16, #gpu.address_space<workgroup>>
+      gpu.barrier
+      %alloca_2 = memref.alloca() : memref<256xf16, #gpu.address_space<workgroup>>
+      %34 = arith.cmpi ult, %13, %c256 : index
+      scf.if %34 {
+        %43 = memref.load %alloca[%14] : memref<512xf16, #gpu.address_space<workgroup>>
+        %44 = arith.addi %14, %c1 : index
+        %45 = memref.load %alloca[%44] : memref<512xf16, #gpu.address_space<workgroup>>
+        %46 = arith.maximumf %45, %43 : f16
+        memref.store %46, %alloca_2[%13] : memref<256xf16, #gpu.address_space<workgroup>>
+      }
+      gpu.barrier
+      %alloca_3 = memref.alloca() : memref<128xf16, #gpu.address_space<workgroup>>
+      %35 = arith.cmpi ult, %13, %c128 : index
+      scf.if %35 {
+        %43 = memref.load %alloca_2[%14] : memref<256xf16, #gpu.address_space<workgroup>>
+        %44 = arith.addi %14, %c1 : index
+        %45 = memref.load %alloca_2[%44] : memref<256xf16, #gpu.address_space<workgroup>>
+        %46 = arith.maximumf %45, %43 : f16
+        memref.store %46, %alloca_3[%13] : memref<128xf16, #gpu.address_space<workgroup>>
+      }
+      gpu.barrier
+      %alloca_4 = memref.alloca() : memref<64xf16, #gpu.address_space<workgroup>>
+      %36 = arith.cmpi ult, %13, %c64 : index
+      scf.if %36 {
+        %43 = memref.load %alloca_3[%14] : memref<128xf16, #gpu.address_space<workgroup>>
+        %44 = arith.addi %14, %c1 : index
+        %45 = memref.load %alloca_3[%44] : memref<128xf16, #gpu.address_space<workgroup>>
+        %46 = arith.maximumf %45, %43 : f16
+        memref.store %46, %alloca_4[%13] : memref<64xf16, #gpu.address_space<workgroup>>
+      }
+      gpu.barrier
+      %alloca_5 = memref.alloca() : memref<32xf16, #gpu.address_space<workgroup>>
+      %37 = arith.cmpi ult, %13, %c32 : index
+      scf.if %37 {
+        %43 = memref.load %alloca_4[%14] : memref<64xf16, #gpu.address_space<workgroup>>
+        %44 = arith.addi %14, %c1 : index
+        %45 = memref.load %alloca_4[%44] : memref<64xf16, #gpu.address_space<workgroup>>
+        %46 = arith.maximumf %45, %43 : f16
+        memref.store %46, %alloca_5[%13] : memref<32xf16, #gpu.address_space<workgroup>>
+      }
+      gpu.barrier
+      %alloca_6 = memref.alloca() : memref<16xf16, #gpu.address_space<workgroup>>
+      %38 = arith.cmpi ult, %13, %c16 : index
+      scf.if %38 {
+        %43 = memref.load %alloca_5[%14] : memref<32xf16, #gpu.address_space<workgroup>>
+        %44 = arith.addi %14, %c1 : index
+        %45 = memref.load %alloca_5[%44] : memref<32xf16, #gpu.address_space<workgroup>>
+        %46 = arith.maximumf %45, %43 : f16
+        memref.store %46, %alloca_6[%13] : memref<16xf16, #gpu.address_space<workgroup>>
+      }
+      gpu.barrier
+      %alloca_7 = memref.alloca() : memref<8xf16, #gpu.address_space<workgroup>>
+      %39 = arith.cmpi ult, %13, %c8 : index
+      scf.if %39 {
+        %43 = memref.load %alloca_6[%14] : memref<16xf16, #gpu.address_space<workgroup>>
+        %44 = arith.addi %14, %c1 : index
+        %45 = memref.load %alloca_6[%44] : memref<16xf16, #gpu.address_space<workgroup>>
+        %46 = arith.maximumf %45, %43 : f16
+        memref.store %46, %alloca_7[%13] : memref<8xf16, #gpu.address_space<workgroup>>
+      }
+      gpu.barrier
+      %alloca_8 = memref.alloca() : memref<4xf16, #gpu.address_space<workgroup>>
+      %40 = arith.cmpi ult, %13, %c4 : index
+      scf.if %40 {
+        %43 = memref.load %alloca_7[%14] : memref<8xf16, #gpu.address_space<workgroup>>
+        %44 = arith.addi %14, %c1 : index
+        %45 = memref.load %alloca_7[%44] : memref<8xf16, #gpu.address_space<workgroup>>
+        %46 = arith.maximumf %45, %43 : f16
+        memref.store %46, %alloca_8[%13] : memref<4xf16, #gpu.address_space<workgroup>>
+      }
+      gpu.barrier
+      %alloca_9 = memref.alloca() : memref<2xf16, #gpu.address_space<workgroup>>
+      %41 = arith.cmpi ult, %13, %c2 : index
+      scf.if %41 {
+        %43 = memref.load %alloca_8[%14] : memref<4xf16, #gpu.address_space<workgroup>>
+        %44 = arith.addi %14, %c1 : index
+        %45 = memref.load %alloca_8[%44] : memref<4xf16, #gpu.address_space<workgroup>>
+        %46 = arith.maximumf %45, %43 : f16
+        memref.store %46, %alloca_9[%13] : memref<2xf16, #gpu.address_space<workgroup>>
+      }
+      gpu.barrier
+      %42 = arith.cmpi ult, %13, %c1 : index
+      scf.if %42
{ + %43 = memref.load %alloca_9[%14] : memref<2xf16, #gpu.address_space> + %44 = arith.addi %14, %c1 : index + %45 = memref.load %alloca_9[%44] : memref<2xf16, #gpu.address_space> + %46 = arith.maximumf %45, %43 : f16 + memref.store %46, %arg1[%12] : memref<4xf16> + } + gpu.barrier + gpu.return + } + gpu.func @Unknown67_kernel(%arg0: memref<4x1000xf16>, %arg1: memref<4xf16>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %0 = gpu.block_id x + %1 = gpu.block_id y + %2 = gpu.block_id z + %3 = gpu.thread_id x + %4 = gpu.thread_id y + %5 = gpu.thread_id z + %6 = gpu.grid_dim x + %7 = gpu.grid_dim y + %8 = gpu.grid_dim z + %9 = gpu.block_dim x + %10 = gpu.block_dim y + %11 = gpu.block_dim z + cf.br ^bb1 + ^bb1: // pred: ^bb0 + %c2 = arith.constant 2 : index %c0 = arith.constant 0 : index - %c512000 = arith.constant 512000 : index - %c512 = arith.constant 512 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512000 : index - scf.if %5 { - %6 = arith.remsi %4, %c512 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c512 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c512 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9] : memref<1000x512xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9] : memref<1000x512xf16> - } - gpu.return - } - gpu.func @Unknown22(%arg0: memref<4x1000xf32>, %arg1: memref<4x1000xf16>) kernel { - %cst = arith.constant -2.500000e-01 : f32 - %c0 = arith.constant 0 : index - %c4000 = arith.constant 4000 : index - %c1000 = arith.constant 1000 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c4000 : index - scf.if %5 { - %6 = arith.remsi %4, %c1000 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c1000 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c1000 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9] : memref<4x1000xf32> - %17 = arith.mulf %16, %cst : f32 - %18 = arith.truncf %17 : f32 to f16 - memref.store %18, %arg1[%15, %9] : memref<4x1000xf16> - } - gpu.return - } - gpu.func @Unknown21(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi 
%c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf16> - } - gpu.return - } - gpu.func @Unknown20(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf16> - } - gpu.return - } - gpu.func @Unknown19(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index %c-1 = arith.constant -1 : index %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 
: index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf16> - } - gpu.return - } - gpu.func @Unknown18(%arg0: memref<512x256x3x3xf32>, %arg1: memref<512x256x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c1179648 = arith.constant 1179648 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1179648 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x256x3x3xf16> - } - gpu.return - } - gpu.func @Unknown17(%arg0: memref<512x256x1x1xf32>, %arg1: memref<512x256x1x1xf16>) kernel { - %c0 = arith.constant 0 : index - %c131072 = arith.constant 131072 : index - %c256 = arith.constant 256 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c131072 : index - scf.if %5 { - %6 = arith.remsi %4, %c256 : index - %7 = arith.cmpi slt, %6, %c0 : 
index - %8 = arith.addi %6, %c256 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c256 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<512x256x1x1xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<512x256x1x1xf16> - } - gpu.return - } - gpu.func @Unknown16(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf16> - } - gpu.return - } - gpu.func @Unknown15(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi 
%c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf16> - } - gpu.return - } - gpu.func @Unknown14(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + %c-1024 = arith.constant -1024 : index + %c1000 = arith.constant 1000 : index + %cst = arith.constant 0.000000e+00 : f16 + %c1 = arith.constant 1 : index %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf16> - } - gpu.return - } - gpu.func @Unknown13(%arg0: memref<256x128x3x3xf32>, %arg1: memref<256x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c294912 = arith.constant 294912 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c294912 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - 
%17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x128x3x3xf16> - } - gpu.return - } - gpu.func @Unknown12(%arg0: memref<256x128x1x1xf32>, %arg1: memref<256x128x1x1xf16>) kernel { - %c0 = arith.constant 0 : index - %c32768 = arith.constant 32768 : index - %c128 = arith.constant 128 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c32768 : index - scf.if %5 { - %6 = arith.remsi %4, %c128 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c128 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c128 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<256x128x1x1xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<256x128x1x1xf16> - } - gpu.return - } - gpu.func @Unknown11(%arg0: memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = 
arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf16> - } - gpu.return - } - gpu.func @Unknown10(%arg0: memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf16> - } - gpu.return - } - gpu.func @Unknown9(%arg0: memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, 
%c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf16> - } - gpu.return - } - gpu.func @Unknown8(%arg0: memref<128x64x3x3xf32>, %arg1: memref<128x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c73728 = arith.constant 73728 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c73728 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x64x3x3xf16> - } - gpu.return - } - gpu.func @Unknown7(%arg0: memref<128x64x1x1xf32>, %arg1: memref<128x64x1x1xf16>) kernel { - %c0 = arith.constant 0 : index - %c8192 = arith.constant 8192 : index - %c64 = arith.constant 64 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c8192 : index - scf.if %5 { - %6 = arith.remsi %4, %c64 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c64 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c64 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<128x64x1x1xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<128x64x1x1xf16> - } - gpu.return - } - gpu.func @Unknown6(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = 
gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> - } - gpu.return - } - gpu.func @Unknown5(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> - } - gpu.return - } - gpu.func @Unknown4(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : 
index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> - } - gpu.return - } - gpu.func @Unknown3(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { + %c32 = arith.constant 32 : index + %c16 = arith.constant 16 : index + %c8 = arith.constant 8 : index + %c4 = arith.constant 4 : index + %12 = gpu.block_id x + %subview = memref.subview %arg0[%12, 0] [1, 1000] [1, 1] : memref<4x1000xf16> to memref<1000xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<1000xf16, strided<[1], offset: ?>> into memref<1x1000xf16, strided<[1000, 1], offset: ?>> + %alloca = memref.alloca() : memref<512xf16, #gpu.address_space> + %13 = gpu.thread_id x + %14 = arith.muli %13, %c2 : index + %15 = arith.cmpi slt, %13, %c0 : index + %16 = arith.subi %c-1, %13 : index + %17 = arith.select %15, %16, %13 : index + %18 = arith.divsi %17, %c512 : index + %19 = arith.subi %c-1, %18 : index + %20 = arith.select %15, %19, %18 : index + %21 = arith.muli %20, %c-1024 : index + %22 = arith.addi %14, %21 : index + %23 = arith.cmpi slt, %22, %c1000 : index + %24 = arith.select %23, %22, %c1000 : index + %25 = arith.addi %22, %c2 : index + %26 = arith.cmpi slt, %25, %c1000 : index + %27 = arith.select %26, %25, %c1000 : index + %28 = arith.subi %27, %24 : index + %subview_0 = memref.subview %expand_shape[0, %24] [1, %28] [1, 1] : memref<1x1000xf16, strided<[1000, 1], offset: ?>> to memref> + %expand_shape_1 = memref.expand_shape %subview_0 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %29 = arith.cmpi ugt, %28, %c0 : index + %30 = scf.if %29 -> (f16) { + %46 = memref.load %expand_shape_1[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %46 : f16 + } else { + scf.yield %cst : f16 + } + %31 = math.exp %30 : f16 + %32 = arith.addf %31, %cst : f16 + %33 = arith.cmpi ugt, %28, %c1 : index + %34 = scf.if %33 -> (f16) { + %46 = memref.load %expand_shape_1[%c0, %c1] : memref<1x?xf16, strided<[?, 1], 
offset: ?>> + scf.yield %46 : f16 + } else { + scf.yield %cst : f16 + } + %35 = math.exp %34 : f16 + %36 = arith.addf %32, %35 : f16 + memref.store %36, %alloca[%13] : memref<512xf16, #gpu.address_space> + gpu.barrier + %alloca_2 = memref.alloca() : memref<256xf16, #gpu.address_space> + %37 = arith.cmpi ult, %13, %c256 : index + scf.if %37 { + %46 = memref.load %alloca[%14] : memref<512xf16, #gpu.address_space> + %47 = arith.addf %46, %cst : f16 + %48 = arith.addi %14, %c1 : index + %49 = memref.load %alloca[%48] : memref<512xf16, #gpu.address_space> + %50 = arith.addf %49, %47 : f16 + memref.store %50, %alloca_2[%13] : memref<256xf16, #gpu.address_space> + } + gpu.barrier + %alloca_3 = memref.alloca() : memref<128xf16, #gpu.address_space> + %38 = arith.cmpi ult, %13, %c128 : index + scf.if %38 { + %46 = memref.load %alloca_2[%14] : memref<256xf16, #gpu.address_space> + %47 = arith.addf %46, %cst : f16 + %48 = arith.addi %14, %c1 : index + %49 = memref.load %alloca_2[%48] : memref<256xf16, #gpu.address_space> + %50 = arith.addf %49, %47 : f16 + memref.store %50, %alloca_3[%13] : memref<128xf16, #gpu.address_space> + } + gpu.barrier + %alloca_4 = memref.alloca() : memref<64xf16, #gpu.address_space> + %39 = arith.cmpi ult, %13, %c64 : index + scf.if %39 { + %46 = memref.load %alloca_3[%14] : memref<128xf16, #gpu.address_space> + %47 = arith.addf %46, %cst : f16 + %48 = arith.addi %14, %c1 : index + %49 = memref.load %alloca_3[%48] : memref<128xf16, #gpu.address_space> + %50 = arith.addf %49, %47 : f16 + memref.store %50, %alloca_4[%13] : memref<64xf16, #gpu.address_space> + } + gpu.barrier + %alloca_5 = memref.alloca() : memref<32xf16, #gpu.address_space> + %40 = arith.cmpi ult, %13, %c32 : index + scf.if %40 { + %46 = memref.load %alloca_4[%14] : memref<64xf16, #gpu.address_space> + %47 = arith.addf %46, %cst : f16 + %48 = arith.addi %14, %c1 : index + %49 = memref.load %alloca_4[%48] : memref<64xf16, #gpu.address_space> + %50 = arith.addf %49, %47 : f16 + memref.store %50, %alloca_5[%13] : memref<32xf16, #gpu.address_space> + } + gpu.barrier + %alloca_6 = memref.alloca() : memref<16xf16, #gpu.address_space> + %41 = arith.cmpi ult, %13, %c16 : index + scf.if %41 { + %46 = memref.load %alloca_5[%14] : memref<32xf16, #gpu.address_space> + %47 = arith.addf %46, %cst : f16 + %48 = arith.addi %14, %c1 : index + %49 = memref.load %alloca_5[%48] : memref<32xf16, #gpu.address_space> + %50 = arith.addf %49, %47 : f16 + memref.store %50, %alloca_6[%13] : memref<16xf16, #gpu.address_space> + } + gpu.barrier + %alloca_7 = memref.alloca() : memref<8xf16, #gpu.address_space> + %42 = arith.cmpi ult, %13, %c8 : index + scf.if %42 { + %46 = memref.load %alloca_6[%14] : memref<16xf16, #gpu.address_space> + %47 = arith.addf %46, %cst : f16 + %48 = arith.addi %14, %c1 : index + %49 = memref.load %alloca_6[%48] : memref<16xf16, #gpu.address_space> + %50 = arith.addf %49, %47 : f16 + memref.store %50, %alloca_7[%13] : memref<8xf16, #gpu.address_space> + } + gpu.barrier + %alloca_8 = memref.alloca() : memref<4xf16, #gpu.address_space> + %43 = arith.cmpi ult, %13, %c4 : index + scf.if %43 { + %46 = memref.load %alloca_7[%14] : memref<8xf16, #gpu.address_space> + %47 = arith.addf %46, %cst : f16 + %48 = arith.addi %14, %c1 : index + %49 = memref.load %alloca_7[%48] : memref<8xf16, #gpu.address_space> + %50 = arith.addf %49, %47 : f16 + memref.store %50, %alloca_8[%13] : memref<4xf16, #gpu.address_space> + } + gpu.barrier + %alloca_9 = memref.alloca() : memref<2xf16, #gpu.address_space> + %44 = arith.cmpi ult, %13, 
%c2 : index + scf.if %44 { + %46 = memref.load %alloca_8[%14] : memref<4xf16, #gpu.address_space> + %47 = arith.addf %46, %cst : f16 + %48 = arith.addi %14, %c1 : index + %49 = memref.load %alloca_8[%48] : memref<4xf16, #gpu.address_space> + %50 = arith.addf %49, %47 : f16 + memref.store %50, %alloca_9[%13] : memref<2xf16, #gpu.address_space> + } + gpu.barrier + %45 = arith.cmpi ult, %13, %c1 : index + scf.if %45 { + %46 = memref.load %alloca_9[%14] : memref<2xf16, #gpu.address_space> + %47 = arith.addf %46, %cst : f16 + %48 = arith.addi %14, %c1 : index + %49 = memref.load %alloca_9[%48] : memref<2xf16, #gpu.address_space> + %50 = arith.addf %49, %47 : f16 + memref.store %50, %arg1[%12] : memref<4xf16> + } + gpu.barrier + gpu.return + } + gpu.func @Unknown147_kernel(%arg0: memref<32x125xf16>, %arg1: memref<32x125xf32>, %arg2: memref<32xf32>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %0 = gpu.block_id x + %1 = gpu.block_id y + %2 = gpu.block_id z + %3 = gpu.thread_id x + %4 = gpu.thread_id y + %5 = gpu.thread_id z + %6 = gpu.grid_dim x + %7 = gpu.grid_dim y + %8 = gpu.grid_dim z + %9 = gpu.block_dim x + %10 = gpu.block_dim y + %11 = gpu.block_dim z + cf.br ^bb1 + ^bb1: // pred: ^bb0 + %c128 = arith.constant 128 : index %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + %c125 = arith.constant 125 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 0.000000e+00 : f16 + %cst_0 = arith.constant 0.000000e+00 : f32 %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> - } - gpu.return - } - gpu.func @Unknown1(%arg0: memref<64x3x7x7xf32>, %arg1: memref<64x3x7x7xf16>) kernel { - %c0 = arith.constant 0 : index - %c9408 = arith.constant 9408 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c3 = arith.constant 3 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi 
%2, %3 : index - %5 = arith.cmpi slt, %4, %c9408 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c3 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c3 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c3 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x3x7x7xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x3x7x7xf16> - } - gpu.return - } - gpu.func @Unknown0(%arg0: memref<4x3x224x224xf32>, %arg1: memref<4x3x224x224xf16>) kernel { + %c2 = arith.constant 2 : index + %c32 = arith.constant 32 : index + %c16 = arith.constant 16 : index + %c8 = arith.constant 8 : index + %c4 = arith.constant 4 : index + %12 = gpu.block_id x + %subview = memref.subview %arg0[%12, 0] [1, 125] [1, 1] : memref<32x125xf16> to memref<125xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<125xf16, strided<[1], offset: ?>> into memref<1x125xf16, strided<[125, 1], offset: ?>> + %subview_1 = memref.subview %arg1[%12, 0] [1, 125] [1, 1] : memref<32x125xf32> to memref<125xf32, strided<[1], offset: ?>> + %expand_shape_2 = memref.expand_shape %subview_1 [[0, 1]] : memref<125xf32, strided<[1], offset: ?>> into memref<1x125xf32, strided<[125, 1], offset: ?>> + %alloca = memref.alloca() : memref<128xf32, #gpu.address_space> + %13 = gpu.thread_id x + %14 = arith.remsi %13, %c128 : index + %15 = arith.cmpi slt, %14, %c0 : index + %16 = arith.addi %14, %c128 : index + %17 = arith.select %15, %16, %14 : index + %18 = arith.cmpi slt, %17, %c125 : index + %19 = arith.select %18, %17, %c125 : index + %20 = arith.addi %17, %c1 : index + %21 = arith.cmpi slt, %20, %c125 : index + %22 = arith.select %21, %20, %c125 : index + %23 = arith.subi %22, %19 : index + %subview_3 = memref.subview %expand_shape[0, %19] [1, %23] [1, 1] : memref<1x125xf16, strided<[125, 1], offset: ?>> to memref> + %expand_shape_4 = memref.expand_shape %subview_3 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %subview_5 = memref.subview %expand_shape_2[0, %19] [1, %23] [1, 1] : memref<1x125xf32, strided<[125, 1], offset: ?>> to memref> + %expand_shape_6 = memref.expand_shape %subview_5 [[0, 1]] : memref> into memref<1x?xf32, strided<[?, 1], offset: ?>> + %24 = arith.cmpi ugt, %23, %c0 : index + %25:2 = scf.if %24 -> (f16, f32) { + %36 = memref.load %expand_shape_4[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + %37 = memref.load %expand_shape_6[%c0, %c0] : memref<1x?xf32, strided<[?, 1], offset: ?>> + scf.yield 
%36, %37 : f16, f32 + } else { + scf.yield %cst, %cst_0 : f16, f32 + } + %26 = arith.extf %25#0 : f16 to f32 + %27 = arith.mulf %26, %25#1 : f32 + %28 = arith.addf %27, %cst_0 : f32 + memref.store %28, %alloca[%13] : memref<128xf32, #gpu.address_space> + gpu.barrier + %alloca_7 = memref.alloca() : memref<64xf32, #gpu.address_space> + %29 = arith.cmpi ult, %13, %c64 : index + scf.if %29 { + %36 = arith.muli %13, %c2 : index + %37 = memref.load %alloca[%36] : memref<128xf32, #gpu.address_space> + %38 = arith.addf %37, %cst_0 : f32 + %39 = arith.addi %36, %c1 : index + %40 = memref.load %alloca[%39] : memref<128xf32, #gpu.address_space> + %41 = arith.addf %40, %38 : f32 + memref.store %41, %alloca_7[%13] : memref<64xf32, #gpu.address_space> + } + gpu.barrier + %alloca_8 = memref.alloca() : memref<32xf32, #gpu.address_space> + %30 = arith.cmpi ult, %13, %c32 : index + scf.if %30 { + %36 = arith.muli %13, %c2 : index + %37 = memref.load %alloca_7[%36] : memref<64xf32, #gpu.address_space> + %38 = arith.addf %37, %cst_0 : f32 + %39 = arith.addi %36, %c1 : index + %40 = memref.load %alloca_7[%39] : memref<64xf32, #gpu.address_space> + %41 = arith.addf %40, %38 : f32 + memref.store %41, %alloca_8[%13] : memref<32xf32, #gpu.address_space> + } + gpu.barrier + %alloca_9 = memref.alloca() : memref<16xf32, #gpu.address_space> + %31 = arith.cmpi ult, %13, %c16 : index + scf.if %31 { + %36 = arith.muli %13, %c2 : index + %37 = memref.load %alloca_8[%36] : memref<32xf32, #gpu.address_space> + %38 = arith.addf %37, %cst_0 : f32 + %39 = arith.addi %36, %c1 : index + %40 = memref.load %alloca_8[%39] : memref<32xf32, #gpu.address_space> + %41 = arith.addf %40, %38 : f32 + memref.store %41, %alloca_9[%13] : memref<16xf32, #gpu.address_space> + } + gpu.barrier + %alloca_10 = memref.alloca() : memref<8xf32, #gpu.address_space> + %32 = arith.cmpi ult, %13, %c8 : index + scf.if %32 { + %36 = arith.muli %13, %c2 : index + %37 = memref.load %alloca_9[%36] : memref<16xf32, #gpu.address_space> + %38 = arith.addf %37, %cst_0 : f32 + %39 = arith.addi %36, %c1 : index + %40 = memref.load %alloca_9[%39] : memref<16xf32, #gpu.address_space> + %41 = arith.addf %40, %38 : f32 + memref.store %41, %alloca_10[%13] : memref<8xf32, #gpu.address_space> + } + gpu.barrier + %alloca_11 = memref.alloca() : memref<4xf32, #gpu.address_space> + %33 = arith.cmpi ult, %13, %c4 : index + scf.if %33 { + %36 = arith.muli %13, %c2 : index + %37 = memref.load %alloca_10[%36] : memref<8xf32, #gpu.address_space> + %38 = arith.addf %37, %cst_0 : f32 + %39 = arith.addi %36, %c1 : index + %40 = memref.load %alloca_10[%39] : memref<8xf32, #gpu.address_space> + %41 = arith.addf %40, %38 : f32 + memref.store %41, %alloca_11[%13] : memref<4xf32, #gpu.address_space> + } + gpu.barrier + %alloca_12 = memref.alloca() : memref<2xf32, #gpu.address_space> + %34 = arith.cmpi ult, %13, %c2 : index + scf.if %34 { + %36 = arith.muli %13, %c2 : index + %37 = memref.load %alloca_11[%36] : memref<4xf32, #gpu.address_space> + %38 = arith.addf %37, %cst_0 : f32 + %39 = arith.addi %36, %c1 : index + %40 = memref.load %alloca_11[%39] : memref<4xf32, #gpu.address_space> + %41 = arith.addf %40, %38 : f32 + memref.store %41, %alloca_12[%13] : memref<2xf32, #gpu.address_space> + } + gpu.barrier + %35 = arith.cmpi ult, %13, %c1 : index + scf.if %35 { + %36 = arith.muli %13, %c2 : index + %37 = memref.load %alloca_12[%36] : memref<2xf32, #gpu.address_space> + %38 = arith.addf %37, %cst_0 : f32 + %39 = arith.addi %36, %c1 : index + %40 = memref.load %alloca_12[%39] : 
memref<2xf32, #gpu.address_space<workgroup>>
+        %41 = arith.addf %40, %38 : f32
+        memref.store %41, %arg2[%12] : memref<32xf32>
+      }
+      gpu.barrier
+      gpu.return
+    }
+    gpu.func @Unknown147_kernel_0(%arg0: memref<32xf32>, %arg1: memref<f32>) kernel attributes {gpu.known_block_size = array<i32: 32, 1, 1>, gpu.known_grid_size = array<i32: 1, 1, 1>} {
+      %0 = gpu.block_id x
+      %1 = gpu.block_id y
+      %2 = gpu.block_id z
+      %3 = gpu.thread_id x
+      %4 = gpu.thread_id y
+      %5 = gpu.thread_id z
+      %6 = gpu.grid_dim x
+      %7 = gpu.grid_dim y
+      %8 = gpu.grid_dim z
+      %9 = gpu.block_dim x
+      %10 = gpu.block_dim y
+      %11 = gpu.block_dim z
+      cf.br ^bb1
+    ^bb1:  // pred: ^bb0
+      %c32 = arith.constant 32 : index
+      %cst = arith.constant 0.000000e+00 : f32
+      %c16 = arith.constant 16 : index
+      %c2 = arith.constant 2 : index
+      %c1 = arith.constant 1 : index
+      %c8 = arith.constant 8 : index
+      %c4 = arith.constant 4 : index
+      %12 = gpu.block_id x
+      %alloca = memref.alloca() : memref<32xf32, #gpu.address_space<workgroup>>
+      %13 = gpu.thread_id x
+      %14 = arith.muli %12, %c32 : index
+      %15 = arith.addi %14, %13 : index
+      %16 = memref.load %arg0[%15] : memref<32xf32>
+      %17 = arith.addf %16, %cst : f32
+      memref.store %17, %alloca[%13] : memref<32xf32, #gpu.address_space<workgroup>>
+      gpu.barrier
+      %alloca_0 = memref.alloca() : memref<16xf32, #gpu.address_space<workgroup>>
+      %18 = arith.cmpi ult, %13, %c16 : index
+      scf.if %18 {
+        %23 = arith.muli %13, %c2 : index
+        %24 = memref.load %alloca[%23] : memref<32xf32, #gpu.address_space<workgroup>>
+        %25 = arith.addf %24, %cst : f32
+        %26 = arith.addi %23, %c1 : index
+        %27 = memref.load %alloca[%26] : memref<32xf32, #gpu.address_space<workgroup>>
+        %28 = arith.addf %27, %25 : f32
+        memref.store %28, %alloca_0[%13] : memref<16xf32, #gpu.address_space<workgroup>>
+      }
+      gpu.barrier
+      %alloca_1 = memref.alloca() : memref<8xf32, #gpu.address_space<workgroup>>
+      %19 = arith.cmpi ult, %13, %c8 : index
+      scf.if %19 {
+        %23 = arith.muli %13, %c2 : index
+        %24 = memref.load %alloca_0[%23] : memref<16xf32, #gpu.address_space<workgroup>>
+        %25 = arith.addf %24, %cst : f32
+        %26 = arith.addi %23, %c1 : index
+        %27 = memref.load %alloca_0[%26] : memref<16xf32, #gpu.address_space<workgroup>>
+        %28 = arith.addf %27, %25 : f32
+        memref.store %28, %alloca_1[%13] : memref<8xf32, #gpu.address_space<workgroup>>
+      }
+      gpu.barrier
+      %alloca_2 = memref.alloca() : memref<4xf32, #gpu.address_space<workgroup>>
+      %20 = arith.cmpi ult, %13, %c4 : index
+      scf.if %20 {
+        %23 = arith.muli %13, %c2 : index
+        %24 = memref.load %alloca_1[%23] : memref<8xf32, #gpu.address_space<workgroup>>
+        %25 = arith.addf %24, %cst : f32
+        %26 = arith.addi %23, %c1 : index
+        %27 = memref.load %alloca_1[%26] : memref<8xf32, #gpu.address_space<workgroup>>
+        %28 = arith.addf %27, %25 : f32
+        memref.store %28, %alloca_2[%13] : memref<4xf32, #gpu.address_space<workgroup>>
+      }
+      gpu.barrier
+      %alloca_3 = memref.alloca() : memref<2xf32, #gpu.address_space<workgroup>>
+      %21 = arith.cmpi ult, %13, %c2 : index
+      scf.if %21 {
+        %23 = arith.muli %13, %c2 : index
+        %24 = memref.load %alloca_2[%23] : memref<4xf32, #gpu.address_space<workgroup>>
+        %25 = arith.addf %24, %cst : f32
+        %26 = arith.addi %23, %c1 : index
+        %27 = memref.load %alloca_2[%26] : memref<4xf32, #gpu.address_space<workgroup>>
+        %28 = arith.addf %27, %25 : f32
+        memref.store %28, %alloca_3[%13] : memref<2xf32, #gpu.address_space<workgroup>>
+      }
+      gpu.barrier
+      %22 = arith.cmpi ult, %13, %c1 : index
+      scf.if %22 {
+        %23 = arith.muli %13, %c2 : index
+        %24 = memref.load %alloca_3[%23] : memref<2xf32, #gpu.address_space<workgroup>>
+        %25 = arith.addf %24, %cst : f32
+        %26 = arith.addi %23, %c1 : index
+        %27 = memref.load %alloca_3[%26] : memref<2xf32, #gpu.address_space<workgroup>>
+        %28 = arith.addf %27, %25 : f32
+        memref.store %28, %arg1[] :
memref + } + gpu.barrier + gpu.return + } + gpu.func @Unknown171_kernel(%arg0: memref<4x1000xf16>, %arg1: memref<1000xf32>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %0 = gpu.block_id x + %1 = gpu.block_id y + %2 = gpu.block_id z + %3 = gpu.thread_id x + %4 = gpu.thread_id y + %5 = gpu.thread_id z + %6 = gpu.grid_dim x + %7 = gpu.grid_dim y + %8 = gpu.grid_dim z + %9 = gpu.block_dim x + %10 = gpu.block_dim y + %11 = gpu.block_dim z + cf.br ^bb1 + ^bb1: // pred: ^bb0 + %c-32 = arith.constant -32 : index + %c1000 = arith.constant 1000 : index + %c32 = arith.constant 32 : index + %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index - %c602112 = arith.constant 602112 : index - %c224 = arith.constant 224 : index - %c-1 = arith.constant -1 : index - %c3 = arith.constant 3 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c602112 : index - scf.if %5 { - %6 = arith.remsi %4, %c224 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c224 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c224 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c224 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c224 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c224 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c3 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c3 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c3 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x3x224x224xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x3x224x224xf16> - } - gpu.return - } - } - func.func private @Unknown0(%arg0: memref<4x3x224x224xf32>) -> memref<4x3x224x224xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4704 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown0", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + %c2 = arith.constant 2 : index + %cst = arith.constant 0.000000e+00 : f16 + %cst_0 = arith.constant 0.000000e+00 : f32 + %12 = gpu.block_id x + %13 = arith.muli %12, %c-32 : index + %14 = arith.addi %13, %c1000 : index + %15 = arith.cmpi slt, %14, %c32 : index + %16 = arith.select %15, %14, %c32 : index + %17 = arith.muli %12, %c32 : index + %alloca = memref.alloca() : memref<32xf32, #gpu.address_space> + %alloca_1 = memref.alloca() : memref<2x32xf32, #gpu.address_space> + %18 = gpu.thread_id x + %19 = gpu.thread_id y + %20 = arith.cmpi slt, %16, %18 : index + %21 = arith.select %20, %16, %18 : index + %22 = arith.addi %18, %c1 : index + %23 = arith.cmpi slt, %16, %22 : index + %24 = arith.select %23, %16, %22 : index + %25 = 
arith.subi %24, %21 : index + %26 = arith.cmpi ugt, %25, %c0 : index + %27 = scf.if %26 -> (f16) { + %34 = arith.muli %19, %c2 : index + %35 = arith.addi %17, %21 : index + %36 = memref.load %arg0[%34, %35] : memref<4x1000xf16> + scf.yield %36 : f16 + } else { + scf.yield %cst : f16 + } + %28 = arith.extf %27 : f16 to f32 + %29 = arith.addf %28, %cst_0 : f32 + %30 = scf.if %26 -> (f16) { + %34 = arith.muli %19, %c2 : index + %35 = arith.addi %34, %c1 : index + %36 = arith.addi %17, %21 : index + %37 = memref.load %arg0[%35, %36] : memref<4x1000xf16> + scf.yield %37 : f16 + } else { + scf.yield %cst : f16 + } + %31 = arith.extf %30 : f16 to f32 + %32 = arith.addf %29, %31 : f32 + memref.store %32, %alloca_1[%19, %18] : memref<2x32xf32, #gpu.address_space> + gpu.barrier + %33 = arith.cmpi ult, %19, %c1 : index + scf.if %33 { + %34 = memref.load %alloca_1[%c0, %18] : memref<2x32xf32, #gpu.address_space> + %35 = arith.addf %34, %cst_0 : f32 + %36 = memref.load %alloca_1[%c1, %18] : memref<2x32xf32, #gpu.address_space> + %37 = arith.addf %36, %35 : f32 + memref.store %37, %alloca[%18] : memref<32xf32, #gpu.address_space> + } + gpu.barrier + %subview = memref.subview %alloca[0] [%16] [1] : memref<32xf32, #gpu.address_space> to memref, #gpu.address_space> + %subview_2 = memref.subview %arg1[%17] [%16] [1] : memref<1000xf32> to memref> + memref.copy %subview, %subview_2 : memref, #gpu.address_space> to memref> + gpu.return + } + } + func.func private @Unknown0(%arg0: memref<4x3x224x224xf32>) -> memref<4x3x224x224xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 588 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown0", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c588 = arith.constant 588 : index %c1 = arith.constant 1 : index - %c4704 = arith.constant 4704 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x3x224x224xf16> - gpu.launch_func @unified::@Unknown0 blocks in (%c4704, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x3x224x224xf32>, %alloc : memref<4x3x224x224xf16>) + gpu.launch_func @unified::@Unknown0 blocks in (%c588, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<4x3x224x224xf32>, %alloc : memref<4x3x224x224xf16>) return %alloc : memref<4x3x224x224xf16> } - func.func private @Unknown1(%arg0: memref<64x3x7x7xf32>) -> memref<64x3x7x7xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 74 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown1", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown1(%arg0: memref<64x3x7x7xf32>) -> memref<64x3x7x7xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 10 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown1", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c10 = arith.constant 10 : index %c1 = arith.constant 1 : index - %c74 = arith.constant 74 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<64x3x7x7xf16> - gpu.launch_func @unified::@Unknown1 blocks in (%c74, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64x3x7x7xf32>, %alloc : memref<64x3x7x7xf16>) + gpu.launch_func @unified::@Unknown1 blocks in (%c10, %c1, %c1) threads in (%c256, 
%c1, %c1) args(%arg0 : memref<64x3x7x7xf32>, %alloc : memref<64x3x7x7xf16>) return %alloc : memref<64x3x7x7xf16> } - func.func private @Unknown3(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown3", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c288 = arith.constant 288 : index - %alloc = memref.alloc() : memref<64x64x3x3xf16> - gpu.launch_func @unified::@Unknown3 blocks in (%c288, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64x64x3x3xf32>, %alloc : memref<64x64x3x3xf16>) - return %alloc : memref<64x64x3x3xf16> - } - func.func private @Unknown4(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown4", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c288 = arith.constant 288 : index - %alloc = memref.alloc() : memref<64x64x3x3xf16> - gpu.launch_func @unified::@Unknown4 blocks in (%c288, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64x64x3x3xf32>, %alloc : memref<64x64x3x3xf16>) - return %alloc : memref<64x64x3x3xf16> - } - func.func private @Unknown5(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown5", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown3(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 36 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown3", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c36 = arith.constant 36 : index %c1 = arith.constant 1 : index - %c288 = arith.constant 288 : index - %alloc = memref.alloc() : memref<64x64x3x3xf16> - gpu.launch_func @unified::@Unknown5 blocks in (%c288, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64x64x3x3xf32>, %alloc : memref<64x64x3x3xf16>) - return %alloc : memref<64x64x3x3xf16> - } - func.func private @Unknown6(%arg0: memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown6", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c288 = arith.constant 288 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<64x64x3x3xf16> - gpu.launch_func @unified::@Unknown6 blocks in (%c288, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64x64x3x3xf32>, %alloc : memref<64x64x3x3xf16>) + gpu.launch_func @unified::@Unknown3 blocks in (%c36, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<64x64x3x3xf32>, %alloc : memref<64x64x3x3xf16>) return %alloc 
: memref<64x64x3x3xf16> } - func.func private @Unknown7(%arg0: memref<128x64x1x1xf32>) -> memref<128x64x1x1xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 64 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown7", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown7(%arg0: memref<128x64x1x1xf32>) -> memref<128x64x1x1xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 8 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown7", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c8 = arith.constant 8 : index %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<128x64x1x1xf16> - gpu.launch_func @unified::@Unknown7 blocks in (%c64, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128x64x1x1xf32>, %alloc : memref<128x64x1x1xf16>) + gpu.launch_func @unified::@Unknown7 blocks in (%c8, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<128x64x1x1xf32>, %alloc : memref<128x64x1x1xf16>) return %alloc : memref<128x64x1x1xf16> } - func.func private @Unknown8(%arg0: memref<128x64x3x3xf32>) -> memref<128x64x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 576 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown8", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown8(%arg0: memref<128x64x3x3xf32>) -> memref<128x64x3x3xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 72 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown8", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c72 = arith.constant 72 : index %c1 = arith.constant 1 : index - %c576 = arith.constant 576 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<128x64x3x3xf16> - gpu.launch_func @unified::@Unknown8 blocks in (%c576, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128x64x3x3xf32>, %alloc : memref<128x64x3x3xf16>) + gpu.launch_func @unified::@Unknown8 blocks in (%c72, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<128x64x3x3xf32>, %alloc : memref<128x64x3x3xf16>) return %alloc : memref<128x64x3x3xf16> } - func.func private @Unknown9(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown9", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c1152 = arith.constant 1152 : index - %alloc = memref.alloc() : memref<128x128x3x3xf16> - gpu.launch_func @unified::@Unknown9 blocks in (%c1152, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128x128x3x3xf32>, %alloc : memref<128x128x3x3xf16>) - return %alloc : memref<128x128x3x3xf16> - } - func.func private @Unknown10(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1152 : i32, 
__byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown10", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown9(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 144 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown9", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c144 = arith.constant 144 : index %c1 = arith.constant 1 : index - %c1152 = arith.constant 1152 : index - %alloc = memref.alloc() : memref<128x128x3x3xf16> - gpu.launch_func @unified::@Unknown10 blocks in (%c1152, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128x128x3x3xf32>, %alloc : memref<128x128x3x3xf16>) - return %alloc : memref<128x128x3x3xf16> - } - func.func private @Unknown11(%arg0: memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown11", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c1152 = arith.constant 1152 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<128x128x3x3xf16> - gpu.launch_func @unified::@Unknown11 blocks in (%c1152, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128x128x3x3xf32>, %alloc : memref<128x128x3x3xf16>) + gpu.launch_func @unified::@Unknown9 blocks in (%c144, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<128x128x3x3xf32>, %alloc : memref<128x128x3x3xf16>) return %alloc : memref<128x128x3x3xf16> } - func.func private @Unknown12(%arg0: memref<256x128x1x1xf32>) -> memref<256x128x1x1xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 256 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown12", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown12(%arg0: memref<256x128x1x1xf32>) -> memref<256x128x1x1xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 32 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown12", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c32 = arith.constant 32 : index %c1 = arith.constant 1 : index %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<256x128x1x1xf16> - gpu.launch_func @unified::@Unknown12 blocks in (%c256, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256x128x1x1xf32>, %alloc : memref<256x128x1x1xf16>) + gpu.launch_func @unified::@Unknown12 blocks in (%c32, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<256x128x1x1xf32>, %alloc : memref<256x128x1x1xf16>) return %alloc : memref<256x128x1x1xf16> } - func.func private @Unknown13(%arg0: memref<256x128x3x3xf32>) -> memref<256x128x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2304 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown13", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", 
byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown13(%arg0: memref<256x128x3x3xf32>) -> memref<256x128x3x3xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown13", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c288 = arith.constant 288 : index %c1 = arith.constant 1 : index - %c2304 = arith.constant 2304 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<256x128x3x3xf16> - gpu.launch_func @unified::@Unknown13 blocks in (%c2304, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256x128x3x3xf32>, %alloc : memref<256x128x3x3xf16>) + gpu.launch_func @unified::@Unknown13 blocks in (%c288, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<256x128x3x3xf32>, %alloc : memref<256x128x3x3xf16>) return %alloc : memref<256x128x3x3xf16> } - func.func private @Unknown14(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4608 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown14", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c4608 = arith.constant 4608 : index - %alloc = memref.alloc() : memref<256x256x3x3xf16> - gpu.launch_func @unified::@Unknown14 blocks in (%c4608, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256x256x3x3xf32>, %alloc : memref<256x256x3x3xf16>) - return %alloc : memref<256x256x3x3xf16> - } - func.func private @Unknown15(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4608 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown15", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c4608 = arith.constant 4608 : index - %alloc = memref.alloc() : memref<256x256x3x3xf16> - gpu.launch_func @unified::@Unknown15 blocks in (%c4608, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256x256x3x3xf32>, %alloc : memref<256x256x3x3xf16>) - return %alloc : memref<256x256x3x3xf16> - } - func.func private @Unknown16(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4608 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown16", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown14(%arg0: memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 576 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown14", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c576 = arith.constant 576 : index %c1 = arith.constant 1 : index - %c4608 = arith.constant 4608 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<256x256x3x3xf16> - gpu.launch_func @unified::@Unknown16 blocks in (%c4608, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : 
memref<256x256x3x3xf32>, %alloc : memref<256x256x3x3xf16>) + gpu.launch_func @unified::@Unknown14 blocks in (%c576, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<256x256x3x3xf32>, %alloc : memref<256x256x3x3xf16>) return %alloc : memref<256x256x3x3xf16> } - func.func private @Unknown17(%arg0: memref<512x256x1x1xf32>) -> memref<512x256x1x1xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1024 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown17", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + func.func private @Unknown17(%arg0: memref<512x256x1x1xf32>) -> memref<512x256x1x1xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 128 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown17", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { %c128 = arith.constant 128 : index %c1 = arith.constant 1 : index - %c1024 = arith.constant 1024 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<512x256x1x1xf16> - gpu.launch_func @unified::@Unknown17 blocks in (%c1024, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512x256x1x1xf32>, %alloc : memref<512x256x1x1xf16>) + gpu.launch_func @unified::@Unknown17 blocks in (%c128, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<512x256x1x1xf32>, %alloc : memref<512x256x1x1xf16>) return %alloc : memref<512x256x1x1xf16> } - func.func private @Unknown18(%arg0: memref<512x256x3x3xf32>) -> memref<512x256x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 9216 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown18", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown18(%arg0: memref<512x256x3x3xf32>) -> memref<512x256x3x3xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown18", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c1152 = arith.constant 1152 : index %c1 = arith.constant 1 : index - %c9216 = arith.constant 9216 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<512x256x3x3xf16> - gpu.launch_func @unified::@Unknown18 blocks in (%c9216, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512x256x3x3xf32>, %alloc : memref<512x256x3x3xf16>) + gpu.launch_func @unified::@Unknown18 blocks in (%c1152, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<512x256x3x3xf32>, %alloc : memref<512x256x3x3xf16>) return %alloc : memref<512x256x3x3xf16> } - func.func private @Unknown19(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 18432 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown19", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c18432 = arith.constant 18432 : index - %alloc = memref.alloc() : memref<512x512x3x3xf16> - gpu.launch_func @unified::@Unknown19 blocks in (%c18432, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : 
memref<512x512x3x3xf32>, %alloc : memref<512x512x3x3xf16>) - return %alloc : memref<512x512x3x3xf16> - } - func.func private @Unknown20(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 18432 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown20", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c18432 = arith.constant 18432 : index - %alloc = memref.alloc() : memref<512x512x3x3xf16> - gpu.launch_func @unified::@Unknown20 blocks in (%c18432, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512x512x3x3xf32>, %alloc : memref<512x512x3x3xf16>) - return %alloc : memref<512x512x3x3xf16> - } - func.func private @Unknown21(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 18432 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown21", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown19(%arg0: memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 2304 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown19", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c2304 = arith.constant 2304 : index %c1 = arith.constant 1 : index - %c18432 = arith.constant 18432 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<512x512x3x3xf16> - gpu.launch_func @unified::@Unknown21 blocks in (%c18432, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512x512x3x3xf32>, %alloc : memref<512x512x3x3xf16>) + gpu.launch_func @unified::@Unknown19 blocks in (%c2304, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<512x512x3x3xf32>, %alloc : memref<512x512x3x3xf16>) return %alloc : memref<512x512x3x3xf16> } - func.func private @Unknown22(%arg0: memref<4x1000xf32>) -> memref<4x1000xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 32 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown22", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown22(%arg0: memref<4x1000xf32>) -> memref<4x1000xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown22", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index - %c32 = arith.constant 32 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x1000xf16> - gpu.launch_func @unified::@Unknown22 blocks in (%c32, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x1000xf32>, %alloc : memref<4x1000xf16>) + gpu.launch_func @unified::@Unknown22 blocks in (%c4, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<4x1000xf32>, %alloc : memref<4x1000xf16>) return %alloc : memref<4x1000xf16> } - func.func private @Unknown23(%arg0: memref<1000x512xf32>) -> memref<1000x512xf16> 
attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4000 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown23", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown23(%arg0: memref<1000x512xf32>) -> memref<1000x512xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 500 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown23", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c500 = arith.constant 500 : index %c1 = arith.constant 1 : index - %c4000 = arith.constant 4000 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<1000x512xf16> - gpu.launch_func @unified::@Unknown23 blocks in (%c4000, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1000x512xf32>, %alloc : memref<1000x512xf16>) + gpu.launch_func @unified::@Unknown23 blocks in (%c500, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1000x512xf32>, %alloc : memref<1000x512xf16>) return %alloc : memref<1000x512xf16> } - func.func private @Unknown24(%arg0: memref<4x64x112x112xf16>) -> (memref<4x64x112x112xf16>, memref<4x64x112x112xi1>) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 25088 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown24", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + func.func private @Unknown24(%arg0: memref<1000xf32>) -> memref<1000xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32], __byre__kernel_name = "Unknown24", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<1000xf16> + gpu.launch_func @unified::@Unknown24 blocks in (%c1, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1000xf32>, %alloc : memref<1000xf16>) + return %alloc : memref<1000xf16> + } + func.func private @Unknown25(%arg0: memref<4x1000xf16>) -> memref<4xf16> attributes {__byteir_reduction_fusion__} { + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c64 = arith.constant 64 : index %c128 = arith.constant 128 : index + %c256 = arith.constant 256 : index + %c4 = arith.constant 4 : index + %c1000 = arith.constant 1000 : index + %c-1024 = arith.constant -1024 : index + %c-1 = arith.constant -1 : index + %c512 = arith.constant 512 : index + %c2 = arith.constant 2 : index + %cst = arith.constant 0.000000e+00 : f16 + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %alloc = memref.alloc() : memref<4xf16> + gpu.launch_func @unified::@Unknown25_kernel blocks in (%c4, %c1, %c1) threads in (%c512, %c1, %c1) args(%arg0 : memref<4x1000xf16>, %alloc : memref<4xf16>) + return %alloc : memref<4xf16> + } + func.func private @Unknown26(%arg0: memref<4x64x112x112xf16>) -> (memref<4x64x112x112xf16>, memref<4x64x112x112xi1>) attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 3136 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown26", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = 
"PTXOp", byre_force_compute_name} { + %c3136 = arith.constant 3136 : index %c1 = arith.constant 1 : index - %c25088 = arith.constant 25088 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x64x112x112xi1> %alloc_0 = memref.alloc() : memref<4x64x112x112xf16> - gpu.launch_func @unified::@Unknown24 blocks in (%c25088, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x64x112x112xf16>, %alloc_0 : memref<4x64x112x112xf16>, %alloc : memref<4x64x112x112xi1>) + gpu.launch_func @unified::@Unknown26 blocks in (%c3136, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<4x64x112x112xf16>, %alloc_0 : memref<4x64x112x112xf16>, %alloc : memref<4x64x112x112xi1>) return %alloc_0, %alloc : memref<4x64x112x112xf16>, memref<4x64x112x112xi1> } - func.func private @Unknown26(%arg0: memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 6272 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown26", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c6272 = arith.constant 6272 : index - %alloc = memref.alloc() : memref<4x64x56x56xi1> - %alloc_0 = memref.alloc() : memref<4x64x56x56xf16> - gpu.launch_func @unified::@Unknown26 blocks in (%c6272, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x64x56x56xf16>, %alloc_0 : memref<4x64x56x56xf16>, %alloc : memref<4x64x56x56xi1>) - return %alloc_0, %alloc : memref<4x64x56x56xf16>, memref<4x64x56x56xi1> - } - func.func private @Unknown28(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 6272 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown28", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c6272 = arith.constant 6272 : index - %alloc = memref.alloc() : memref<4x64x56x56xi1> - %alloc_0 = memref.alloc() : memref<4x64x56x56xf16> - gpu.launch_func @unified::@Unknown28 blocks in (%c6272, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x64x56x56xf16>, %arg1 : memref<4x64x56x56xf16>, %alloc_0 : memref<4x64x56x56xf16>, %alloc : memref<4x64x56x56xi1>) - return %alloc_0, %alloc : memref<4x64x56x56xf16>, memref<4x64x56x56xi1> - } - func.func private @Unknown30(%arg0: memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 6272 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown30", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown28(%arg0: memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown28", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c784 = arith.constant 784 : index %c1 = 
arith.constant 1 : index - %c6272 = arith.constant 6272 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x64x56x56xi1> %alloc_0 = memref.alloc() : memref<4x64x56x56xf16> - gpu.launch_func @unified::@Unknown30 blocks in (%c6272, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x64x56x56xf16>, %alloc_0 : memref<4x64x56x56xf16>, %alloc : memref<4x64x56x56xi1>) + gpu.launch_func @unified::@Unknown28 blocks in (%c784, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<4x64x56x56xf16>, %alloc_0 : memref<4x64x56x56xf16>, %alloc : memref<4x64x56x56xi1>) return %alloc_0, %alloc : memref<4x64x56x56xf16>, memref<4x64x56x56xi1> } - func.func private @Unknown32(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 6272 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown32", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown30(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown30", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c784 = arith.constant 784 : index %c1 = arith.constant 1 : index - %c6272 = arith.constant 6272 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x64x56x56xi1> %alloc_0 = memref.alloc() : memref<4x64x56x56xf16> - gpu.launch_func @unified::@Unknown32 blocks in (%c6272, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x64x56x56xf16>, %arg1 : memref<4x64x56x56xf16>, %alloc_0 : memref<4x64x56x56xf16>, %alloc : memref<4x64x56x56xi1>) + gpu.launch_func @unified::@Unknown30 blocks in (%c784, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<4x64x56x56xf16>, %arg1 : memref<4x64x56x56xf16>, %alloc_0 : memref<4x64x56x56xf16>, %alloc : memref<4x64x56x56xi1>) return %alloc_0, %alloc : memref<4x64x56x56xf16>, memref<4x64x56x56xi1> } - func.func private @Unknown35(%arg0: memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 3136 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown35", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c3136 = arith.constant 3136 : index - %alloc = memref.alloc() : memref<4x128x28x28xi1> - %alloc_0 = memref.alloc() : memref<4x128x28x28xf16> - gpu.launch_func @unified::@Unknown35 blocks in (%c3136, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x128x28x28xf16>, %alloc_0 : memref<4x128x28x28xf16>, %alloc : memref<4x128x28x28xi1>) - return %alloc_0, %alloc : memref<4x128x28x28xf16>, memref<4x128x28x28xi1> - } - func.func private @Unknown37(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 3136 : i32, __byre__arg_ranks = 
[4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown37", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c3136 = arith.constant 3136 : index - %alloc = memref.alloc() : memref<4x128x28x28xi1> - %alloc_0 = memref.alloc() : memref<4x128x28x28xf16> - gpu.launch_func @unified::@Unknown37 blocks in (%c3136, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x128x28x28xf16>, %arg1 : memref<4x128x28x28xf16>, %alloc_0 : memref<4x128x28x28xf16>, %alloc : memref<4x128x28x28xi1>) - return %alloc_0, %alloc : memref<4x128x28x28xf16>, memref<4x128x28x28xi1> - } - func.func private @Unknown39(%arg0: memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 3136 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown39", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown37(%arg0: memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 392 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown37", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c392 = arith.constant 392 : index %c1 = arith.constant 1 : index - %c3136 = arith.constant 3136 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x128x28x28xi1> %alloc_0 = memref.alloc() : memref<4x128x28x28xf16> - gpu.launch_func @unified::@Unknown39 blocks in (%c3136, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x128x28x28xf16>, %alloc_0 : memref<4x128x28x28xf16>, %alloc : memref<4x128x28x28xi1>) + gpu.launch_func @unified::@Unknown37 blocks in (%c392, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<4x128x28x28xf16>, %alloc_0 : memref<4x128x28x28xf16>, %alloc : memref<4x128x28x28xi1>) return %alloc_0, %alloc : memref<4x128x28x28xf16>, memref<4x128x28x28xi1> } - func.func private @Unknown41(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 3136 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown41", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown39(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 392 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown39", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c392 = arith.constant 392 : index %c1 = arith.constant 1 : index - %c3136 = arith.constant 3136 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x128x28x28xi1> %alloc_0 = memref.alloc() : memref<4x128x28x28xf16> - gpu.launch_func 
@unified::@Unknown41 blocks in (%c3136, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x128x28x28xf16>, %arg1 : memref<4x128x28x28xf16>, %alloc_0 : memref<4x128x28x28xf16>, %alloc : memref<4x128x28x28xi1>) + gpu.launch_func @unified::@Unknown39 blocks in (%c392, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<4x128x28x28xf16>, %arg1 : memref<4x128x28x28xf16>, %alloc_0 : memref<4x128x28x28xf16>, %alloc : memref<4x128x28x28xi1>) return %alloc_0, %alloc : memref<4x128x28x28xf16>, memref<4x128x28x28xi1> } - func.func private @Unknown44(%arg0: memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown44", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c1568 = arith.constant 1568 : index - %alloc = memref.alloc() : memref<4x256x14x14xi1> - %alloc_0 = memref.alloc() : memref<4x256x14x14xf16> - gpu.launch_func @unified::@Unknown44 blocks in (%c1568, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x256x14x14xf16>, %alloc_0 : memref<4x256x14x14xf16>, %alloc : memref<4x256x14x14xi1>) - return %alloc_0, %alloc : memref<4x256x14x14xf16>, memref<4x256x14x14xi1> - } - func.func private @Unknown46(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown46", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c1568 = arith.constant 1568 : index - %alloc = memref.alloc() : memref<4x256x14x14xi1> - %alloc_0 = memref.alloc() : memref<4x256x14x14xf16> - gpu.launch_func @unified::@Unknown46 blocks in (%c1568, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x256x14x14xf16>, %arg1 : memref<4x256x14x14xf16>, %alloc_0 : memref<4x256x14x14xf16>, %alloc : memref<4x256x14x14xi1>) - return %alloc_0, %alloc : memref<4x256x14x14xf16>, memref<4x256x14x14xi1> - } - func.func private @Unknown48(%arg0: memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown48", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown46(%arg0: memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown46", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c196 = arith.constant 196 : index %c1 = arith.constant 1 : index - %c1568 = arith.constant 1568 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x256x14x14xi1> %alloc_0 = memref.alloc() : memref<4x256x14x14xf16> - gpu.launch_func 
@unified::@Unknown48 blocks in (%c1568, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x256x14x14xf16>, %alloc_0 : memref<4x256x14x14xf16>, %alloc : memref<4x256x14x14xi1>) + gpu.launch_func @unified::@Unknown46 blocks in (%c196, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<4x256x14x14xf16>, %alloc_0 : memref<4x256x14x14xf16>, %alloc : memref<4x256x14x14xi1>) return %alloc_0, %alloc : memref<4x256x14x14xf16>, memref<4x256x14x14xi1> } - func.func private @Unknown50(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown50", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown48(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown48", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c196 = arith.constant 196 : index %c1 = arith.constant 1 : index - %c1568 = arith.constant 1568 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x256x14x14xi1> %alloc_0 = memref.alloc() : memref<4x256x14x14xf16> - gpu.launch_func @unified::@Unknown50 blocks in (%c1568, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x256x14x14xf16>, %arg1 : memref<4x256x14x14xf16>, %alloc_0 : memref<4x256x14x14xf16>, %alloc : memref<4x256x14x14xi1>) + gpu.launch_func @unified::@Unknown48 blocks in (%c196, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<4x256x14x14xf16>, %arg1 : memref<4x256x14x14xf16>, %alloc_0 : memref<4x256x14x14xf16>, %alloc : memref<4x256x14x14xi1>) return %alloc_0, %alloc : memref<4x256x14x14xf16>, memref<4x256x14x14xi1> } - func.func private @Unknown53(%arg0: memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown53", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c784 = arith.constant 784 : index - %alloc = memref.alloc() : memref<4x512x7x7xi1> - %alloc_0 = memref.alloc() : memref<4x512x7x7xf16> - gpu.launch_func @unified::@Unknown53 blocks in (%c784, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x512x7x7xf16>, %alloc_0 : memref<4x512x7x7xf16>, %alloc : memref<4x512x7x7xi1>) - return %alloc_0, %alloc : memref<4x512x7x7xf16>, memref<4x512x7x7xi1> - } - func.func private @Unknown55(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown55", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = 
arith.constant 128 : index + func.func private @Unknown55(%arg0: memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 98 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown55", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c98 = arith.constant 98 : index %c1 = arith.constant 1 : index - %c784 = arith.constant 784 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x512x7x7xi1> %alloc_0 = memref.alloc() : memref<4x512x7x7xf16> - gpu.launch_func @unified::@Unknown55 blocks in (%c784, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x512x7x7xf16>, %arg1 : memref<4x512x7x7xf16>, %alloc_0 : memref<4x512x7x7xf16>, %alloc : memref<4x512x7x7xi1>) + gpu.launch_func @unified::@Unknown55 blocks in (%c98, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<4x512x7x7xf16>, %alloc_0 : memref<4x512x7x7xf16>, %alloc : memref<4x512x7x7xi1>) return %alloc_0, %alloc : memref<4x512x7x7xf16>, memref<4x512x7x7xi1> } - func.func private @Unknown57(%arg0: memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown57", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown57(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 98 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown57", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c98 = arith.constant 98 : index %c1 = arith.constant 1 : index - %c784 = arith.constant 784 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x512x7x7xi1> %alloc_0 = memref.alloc() : memref<4x512x7x7xf16> - gpu.launch_func @unified::@Unknown57 blocks in (%c784, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x512x7x7xf16>, %alloc_0 : memref<4x512x7x7xf16>, %alloc : memref<4x512x7x7xi1>) + gpu.launch_func @unified::@Unknown57 blocks in (%c98, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<4x512x7x7xf16>, %arg1 : memref<4x512x7x7xf16>, %alloc_0 : memref<4x512x7x7xf16>, %alloc : memref<4x512x7x7xi1>) return %alloc_0, %alloc : memref<4x512x7x7xf16>, memref<4x512x7x7xi1> } - func.func private @Unknown59(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown59", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown62(%arg0: memref<4x512x7x7xf16>) -> memref<4x512xf16> attributes {__byteir_reduction_fusion__} { + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c2048 = arith.constant 
2048 : index + %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index - %c784 = arith.constant 784 : index - %alloc = memref.alloc() : memref<4x512x7x7xi1> - %alloc_0 = memref.alloc() : memref<4x512x7x7xf16> - gpu.launch_func @unified::@Unknown59 blocks in (%c784, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x512x7x7xf16>, %arg1 : memref<4x512x7x7xf16>, %alloc_0 : memref<4x512x7x7xf16>, %alloc : memref<4x512x7x7xi1>) - return %alloc_0, %alloc : memref<4x512x7x7xf16>, memref<4x512x7x7xi1> + %c49 = arith.constant 49 : index + %c64 = arith.constant 64 : index + %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %collapse_shape = memref.collapse_shape %arg0 [[0, 1], [2, 3]] : memref<4x512x7x7xf16> into memref<2048x49xf16> + %alloc = memref.alloc() : memref<2048xf16> + gpu.launch_func @unified::@Unknown62_kernel blocks in (%c2048, %c1, %c1) threads in (%c64, %c1, %c1) args(%collapse_shape : memref<2048x49xf16>, %alloc : memref<2048xf16>) + %expand_shape = memref.expand_shape %alloc [[0, 1]] : memref<2048xf16> into memref<4x512xf16> + return %expand_shape : memref<4x512xf16> } - func.func private @Unknown60(%arg0: memref<4x512xf16>) -> memref<4x512xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 16 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown60", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown63(%arg0: memref<4x512xf16>) -> memref<4x512xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 2 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown63", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index - %c16 = arith.constant 16 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x512xf16> - gpu.launch_func @unified::@Unknown60 blocks in (%c16, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x512xf16>, %alloc : memref<4x512xf16>) + gpu.launch_func @unified::@Unknown63 blocks in (%c2, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<4x512xf16>, %alloc : memref<4x512xf16>) return %alloc : memref<4x512xf16> } - func.func private @Unknown61(%arg0: memref<1000xf32>, %arg1: memref<4x1000xf16>) -> memref<4x1000xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 32 : i32, __byre__arg_ranks = [1 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown61", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown64(%arg0: memref<1000xf16>, %arg1: memref<4x1000xf16>) -> memref<4x1000xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [1 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown64", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index - %c32 = arith.constant 32 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x1000xf16> - gpu.launch_func @unified::@Unknown61 blocks in (%c32, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1000xf32>, %arg1 : 
memref<4x1000xf16>, %alloc : memref<4x1000xf16>) + gpu.launch_func @unified::@Unknown64 blocks in (%c4, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1000xf16>, %arg1 : memref<4x1000xf16>, %alloc : memref<4x1000xf16>) return %alloc : memref<4x1000xf16> } - func.func private @Unknown62(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>) -> (memref<4x1000xf16>, memref<4x1000xf16>) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 32 : i32, __byre__arg_ranks = [1 : i32, 2 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown62", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + func.func private @Unknown65(%arg0: memref<4x1000xf16>) -> memref<4xf16> attributes {__byteir_reduction_fusion__} { + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c64 = arith.constant 64 : index %c128 = arith.constant 128 : index + %c256 = arith.constant 256 : index + %c4 = arith.constant 4 : index + %c1000 = arith.constant 1000 : index + %c-1024 = arith.constant -1024 : index + %c-1 = arith.constant -1 : index + %c512 = arith.constant 512 : index + %c2 = arith.constant 2 : index + %cst = arith.constant 0.000000e+00 : f16 + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %alloc = memref.alloc() : memref<4xf16> + gpu.launch_func @unified::@Unknown65_kernel blocks in (%c4, %c1, %c1) threads in (%c512, %c1, %c1) args(%arg0 : memref<4x1000xf16>, %alloc : memref<4xf16>) + return %alloc : memref<4xf16> + } + func.func private @Unknown66(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>) -> memref<4x1000xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [1 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown66", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index - %c32 = arith.constant 32 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x1000xf16> - %alloc_0 = memref.alloc() : memref<4x1000xf16> - gpu.launch_func @unified::@Unknown62 blocks in (%c32, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4xf16>, %arg1 : memref<4x1000xf16>, %alloc_0 : memref<4x1000xf16>, %alloc : memref<4x1000xf16>) - return %alloc_0, %alloc : memref<4x1000xf16>, memref<4x1000xf16> + gpu.launch_func @unified::@Unknown66 blocks in (%c4, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<4xf16>, %arg1 : memref<4x1000xf16>, %alloc : memref<4x1000xf16>) + return %alloc : memref<4x1000xf16> } - func.func private @Unknown63(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4xf16>, %arg3: memref<4x1000xf16>, %arg4: memref<4x1000xf32>) -> (memref<4x1000xf16>, memref<4x1000xf32>, memref<4x1000xf32>) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 32 : i32, __byre__arg_ranks = [1 : i32, 2 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown63", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32, 4 : i32, 5 : i32, 6 : i32, 7 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + func.func private @Unknown67(%arg0: memref<4x1000xf16>) -> memref<4xf16> attributes {__byteir_reduction_fusion__} { + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c64 = arith.constant 
64 : index %c128 = arith.constant 128 : index + %c256 = arith.constant 256 : index + %c4 = arith.constant 4 : index + %c1000 = arith.constant 1000 : index + %c-1024 = arith.constant -1024 : index + %c-1 = arith.constant -1 : index + %c512 = arith.constant 512 : index + %c2 = arith.constant 2 : index + %cst = arith.constant 0.000000e+00 : f16 %c1 = arith.constant 1 : index - %c32 = arith.constant 32 : index - %alloc = memref.alloc() : memref<4x1000xf32> - %alloc_0 = memref.alloc() : memref<4x1000xf32> - %alloc_1 = memref.alloc() : memref<4x1000xf16> - gpu.launch_func @unified::@Unknown63 blocks in (%c32, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4xf16>, %arg1 : memref<4x1000xf16>, %arg2 : memref<4xf16>, %arg3 : memref<4x1000xf16>, %arg4 : memref<4x1000xf32>, %alloc_1 : memref<4x1000xf16>, %alloc_0 : memref<4x1000xf32>, %alloc : memref<4x1000xf32>) - return %alloc_1, %alloc_0, %alloc : memref<4x1000xf16>, memref<4x1000xf32>, memref<4x1000xf32> + %c0 = arith.constant 0 : index + %alloc = memref.alloc() : memref<4xf16> + gpu.launch_func @unified::@Unknown67_kernel blocks in (%c4, %c1, %c1) threads in (%c512, %c1, %c1) args(%arg0 : memref<4x1000xf16>, %alloc : memref<4xf16>) + return %alloc : memref<4xf16> } - func.func private @Unknown64(%arg0: memref<4x512xf16>, %arg1: memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [2 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown64", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown68(%arg0: memref<4xf16>) -> memref<4xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32], __byre__kernel_name = "Unknown68", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { %c1 = arith.constant 1 : index - %c784 = arith.constant 784 : index - %alloc = memref.alloc() : memref<4x512x7x7xf16> - gpu.launch_func @unified::@Unknown64 blocks in (%c784, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x512xf16>, %arg1 : memref<4x512x7x7xi1>, %alloc : memref<4x512x7x7xf16>) - return %alloc : memref<4x512x7x7xf16> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<4xf16> + gpu.launch_func @unified::@Unknown68 blocks in (%c1, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<4xf16>, %alloc : memref<4xf16>) + return %alloc : memref<4xf16> } - func.func private @Unknown68(%arg0: memref<4x512x7x7xi1>, %arg1: memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown68", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown69(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4xf16>, %arg3: memref<4x1000xf16>) -> (memref<4x1000xf16>, memref<4x1000xf16>) attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [1 : i32, 2 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown69", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32, 4 : i32, 5 : i32], byre_compute_name = 
"PTXOp", byre_force_compute_name} { + %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index - %c784 = arith.constant 784 : index - %alloc = memref.alloc() : memref<4x512x7x7xf16> - gpu.launch_func @unified::@Unknown68 blocks in (%c784, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x512x7x7xi1>, %arg1 : memref<4x512x7x7xf16>, %alloc : memref<4x512x7x7xf16>) - return %alloc : memref<4x512x7x7xf16> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<4x1000xf16> + %alloc_0 = memref.alloc() : memref<4x1000xf16> + gpu.launch_func @unified::@Unknown69 blocks in (%c4, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<4xf16>, %arg1 : memref<4x1000xf16>, %arg2 : memref<4xf16>, %arg3 : memref<4x1000xf16>, %alloc_0 : memref<4x1000xf16>, %alloc : memref<4x1000xf16>) + return %alloc_0, %alloc : memref<4x1000xf16>, memref<4x1000xf16> } - func.func private @Unknown72(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown72", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown70(%arg0: memref<4x512xf16>, %arg1: memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 98 : i32, __byre__arg_ranks = [2 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown70", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c98 = arith.constant 98 : index %c1 = arith.constant 1 : index - %c784 = arith.constant 784 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x512x7x7xf16> - gpu.launch_func @unified::@Unknown72 blocks in (%c784, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x512x7x7xf16>, %arg1 : memref<4x512x7x7xf16>, %arg2 : memref<4x512x7x7xi1>, %alloc : memref<4x512x7x7xf16>) + gpu.launch_func @unified::@Unknown70 blocks in (%c98, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<4x512xf16>, %arg1 : memref<4x512x7x7xi1>, %alloc : memref<4x512x7x7xf16>) return %alloc : memref<4x512x7x7xf16> } - func.func private @Unknown76(%arg0: memref<4x512x7x7xi1>, %arg1: memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown76", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown74(%arg0: memref<4x512x7x7xi1>, %arg1: memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 98 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown74", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c98 = arith.constant 98 : index %c1 = arith.constant 1 : index - %c784 = arith.constant 784 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x512x7x7xf16> - gpu.launch_func @unified::@Unknown76 blocks in (%c784, %c1, %c1) threads in (%c128, %c1, %c1) 
args(%arg0 : memref<4x512x7x7xi1>, %arg1 : memref<4x512x7x7xf16>, %alloc : memref<4x512x7x7xf16>) + gpu.launch_func @unified::@Unknown74 blocks in (%c98, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<4x512x7x7xi1>, %arg1 : memref<4x512x7x7xf16>, %alloc : memref<4x512x7x7xf16>) return %alloc : memref<4x512x7x7xf16> } - func.func private @Unknown83(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown83", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c1568 = arith.constant 1568 : index - %alloc = memref.alloc() : memref<4x256x14x14xf16> - gpu.launch_func @unified::@Unknown83 blocks in (%c1568, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x256x14x14xf16>, %arg1 : memref<4x256x14x14xf16>, %arg2 : memref<4x256x14x14xi1>, %alloc : memref<4x256x14x14xf16>) - return %alloc : memref<4x256x14x14xf16> - } - func.func private @Unknown87(%arg0: memref<4x256x14x14xi1>, %arg1: memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown87", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown78(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 98 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown78", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c98 = arith.constant 98 : index %c1 = arith.constant 1 : index - %c1568 = arith.constant 1568 : index - %alloc = memref.alloc() : memref<4x256x14x14xf16> - gpu.launch_func @unified::@Unknown87 blocks in (%c1568, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x256x14x14xi1>, %arg1 : memref<4x256x14x14xf16>, %alloc : memref<4x256x14x14xf16>) - return %alloc : memref<4x256x14x14xf16> + %c256 = arith.constant 256 : index + %alloc = memref.alloc() : memref<4x512x7x7xf16> + gpu.launch_func @unified::@Unknown78 blocks in (%c98, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<4x512x7x7xf16>, %arg1 : memref<4x512x7x7xf16>, %arg2 : memref<4x512x7x7xi1>, %alloc : memref<4x512x7x7xf16>) + return %alloc : memref<4x512x7x7xf16> } - func.func private @Unknown91(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown91", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown89(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> attributes 
{__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown89", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c196 = arith.constant 196 : index %c1 = arith.constant 1 : index - %c1568 = arith.constant 1568 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x256x14x14xf16> - gpu.launch_func @unified::@Unknown91 blocks in (%c1568, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x256x14x14xf16>, %arg1 : memref<4x256x14x14xf16>, %arg2 : memref<4x256x14x14xi1>, %alloc : memref<4x256x14x14xf16>) + gpu.launch_func @unified::@Unknown89 blocks in (%c196, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<4x256x14x14xf16>, %arg1 : memref<4x256x14x14xf16>, %arg2 : memref<4x256x14x14xi1>, %alloc : memref<4x256x14x14xf16>) return %alloc : memref<4x256x14x14xf16> } - func.func private @Unknown95(%arg0: memref<4x256x14x14xi1>, %arg1: memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown95", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown93(%arg0: memref<4x256x14x14xi1>, %arg1: memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown93", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c196 = arith.constant 196 : index %c1 = arith.constant 1 : index - %c1568 = arith.constant 1568 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x256x14x14xf16> - gpu.launch_func @unified::@Unknown95 blocks in (%c1568, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x256x14x14xi1>, %arg1 : memref<4x256x14x14xf16>, %alloc : memref<4x256x14x14xf16>) + gpu.launch_func @unified::@Unknown93 blocks in (%c196, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<4x256x14x14xi1>, %arg1 : memref<4x256x14x14xf16>, %alloc : memref<4x256x14x14xf16>) return %alloc : memref<4x256x14x14xf16> } - func.func private @Unknown102(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 3136 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown102", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c3136 = arith.constant 3136 : index - %alloc = memref.alloc() : memref<4x128x28x28xf16> - gpu.launch_func @unified::@Unknown102 blocks in (%c3136, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x128x28x28xf16>, %arg1 : memref<4x128x28x28xf16>, %arg2 : memref<4x128x28x28xi1>, %alloc : memref<4x128x28x28xf16>) - return %alloc : memref<4x128x28x28xf16> - } - func.func private @Unknown106(%arg0: memref<4x128x28x28xi1>, %arg1: memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> attributes 
{__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 3136 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown106", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c3136 = arith.constant 3136 : index - %alloc = memref.alloc() : memref<4x128x28x28xf16> - gpu.launch_func @unified::@Unknown106 blocks in (%c3136, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x128x28x28xi1>, %arg1 : memref<4x128x28x28xf16>, %alloc : memref<4x128x28x28xf16>) - return %alloc : memref<4x128x28x28xf16> - } - func.func private @Unknown110(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 3136 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown110", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown108(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 392 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown108", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c392 = arith.constant 392 : index %c1 = arith.constant 1 : index - %c3136 = arith.constant 3136 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x128x28x28xf16> - gpu.launch_func @unified::@Unknown110 blocks in (%c3136, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x128x28x28xf16>, %arg1 : memref<4x128x28x28xf16>, %arg2 : memref<4x128x28x28xi1>, %alloc : memref<4x128x28x28xf16>) + gpu.launch_func @unified::@Unknown108 blocks in (%c392, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<4x128x28x28xf16>, %arg1 : memref<4x128x28x28xf16>, %arg2 : memref<4x128x28x28xi1>, %alloc : memref<4x128x28x28xf16>) return %alloc : memref<4x128x28x28xf16> } - func.func private @Unknown114(%arg0: memref<4x128x28x28xi1>, %arg1: memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 3136 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown114", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown112(%arg0: memref<4x128x28x28xi1>, %arg1: memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 392 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown112", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c392 = arith.constant 392 : index %c1 = arith.constant 1 : index - %c3136 = arith.constant 3136 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x128x28x28xf16> - gpu.launch_func @unified::@Unknown114 blocks in (%c3136, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : 
memref<4x128x28x28xi1>, %arg1 : memref<4x128x28x28xf16>, %alloc : memref<4x128x28x28xf16>) + gpu.launch_func @unified::@Unknown112 blocks in (%c392, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<4x128x28x28xi1>, %arg1 : memref<4x128x28x28xf16>, %alloc : memref<4x128x28x28xf16>) return %alloc : memref<4x128x28x28xf16> } - func.func private @Unknown121(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 6272 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown121", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c6272 = arith.constant 6272 : index - %alloc = memref.alloc() : memref<4x64x56x56xf16> - gpu.launch_func @unified::@Unknown121 blocks in (%c6272, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x64x56x56xf16>, %arg1 : memref<4x64x56x56xf16>, %arg2 : memref<4x64x56x56xi1>, %alloc : memref<4x64x56x56xf16>) - return %alloc : memref<4x64x56x56xf16> - } - func.func private @Unknown125(%arg0: memref<4x64x56x56xi1>, %arg1: memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 6272 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown125", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c6272 = arith.constant 6272 : index - %alloc = memref.alloc() : memref<4x64x56x56xf16> - gpu.launch_func @unified::@Unknown125 blocks in (%c6272, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x64x56x56xi1>, %arg1 : memref<4x64x56x56xf16>, %alloc : memref<4x64x56x56xf16>) - return %alloc : memref<4x64x56x56xf16> - } - func.func private @Unknown129(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 6272 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown129", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown127(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown127", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c784 = arith.constant 784 : index %c1 = arith.constant 1 : index - %c6272 = arith.constant 6272 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x64x56x56xf16> - gpu.launch_func @unified::@Unknown129 blocks in (%c6272, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x64x56x56xf16>, %arg1 : memref<4x64x56x56xf16>, %arg2 : memref<4x64x56x56xi1>, %alloc : memref<4x64x56x56xf16>) + gpu.launch_func @unified::@Unknown127 blocks in (%c784, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : 
memref<4x64x56x56xf16>, %arg1 : memref<4x64x56x56xf16>, %arg2 : memref<4x64x56x56xi1>, %alloc : memref<4x64x56x56xf16>) return %alloc : memref<4x64x56x56xf16> } - func.func private @Unknown133(%arg0: memref<4x64x56x56xi1>, %arg1: memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 6272 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown133", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown131(%arg0: memref<4x64x56x56xi1>, %arg1: memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown131", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c784 = arith.constant 784 : index %c1 = arith.constant 1 : index - %c6272 = arith.constant 6272 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x64x56x56xf16> - gpu.launch_func @unified::@Unknown133 blocks in (%c6272, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x64x56x56xi1>, %arg1 : memref<4x64x56x56xf16>, %alloc : memref<4x64x56x56xf16>) + gpu.launch_func @unified::@Unknown131 blocks in (%c784, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<4x64x56x56xi1>, %arg1 : memref<4x64x56x56xf16>, %alloc : memref<4x64x56x56xf16>) return %alloc : memref<4x64x56x56xf16> } - func.func private @Unknown137(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 6272 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown137", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown143(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown143", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c784 = arith.constant 784 : index %c1 = arith.constant 1 : index - %c6272 = arith.constant 6272 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x64x56x56xf16> - gpu.launch_func @unified::@Unknown137 blocks in (%c6272, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x64x56x56xf16>, %arg1 : memref<4x64x56x56xf16>, %alloc : memref<4x64x56x56xf16>) + gpu.launch_func @unified::@Unknown143 blocks in (%c784, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<4x64x56x56xf16>, %arg1 : memref<4x64x56x56xf16>, %alloc : memref<4x64x56x56xf16>) return %alloc : memref<4x64x56x56xf16> } - func.func private @Unknown138(%arg0: memref<4x64x112x112xi1>, %arg1: memref<4x64x112x112xf16>) -> memref<4x64x112x112xf16> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 25088 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown138", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", 
byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown144(%arg0: memref<4x64x112x112xi1>, %arg1: memref<4x64x112x112xf16>) -> memref<4x64x112x112xf16> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 3136 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown144", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c3136 = arith.constant 3136 : index %c1 = arith.constant 1 : index - %c25088 = arith.constant 25088 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<4x64x112x112xf16> - gpu.launch_func @unified::@Unknown138 blocks in (%c25088, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<4x64x112x112xi1>, %arg1 : memref<4x64x112x112xf16>, %alloc : memref<4x64x112x112xf16>) + gpu.launch_func @unified::@Unknown144 blocks in (%c3136, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<4x64x112x112xi1>, %arg1 : memref<4x64x112x112xf16>, %alloc : memref<4x64x112x112xf16>) return %alloc : memref<4x64x112x112xf16> } - func.func private @Unknown141(%arg0: memref<f32>) -> memref<f32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [0 : i32, 0 : i32], __byre__kernel_name = "Unknown141", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + func.func private @Unknown147(%arg0: memref<4x1000xf16>, %arg1: memref<4x1000xf32>) -> memref<f32> attributes {__byteir_reduction_fusion__} { + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c64 = arith.constant 64 : index + %c32 = arith.constant 32 : index + %c2 = arith.constant 2 : index + %c1 = arith.constant 1 : index + %c125 = arith.constant 125 : index %c128 = arith.constant 128 : index + %cst = arith.constant 0.000000e+00 : f32 + %cst_0 = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %alloc = memref.alloc() : memref<f32> + %collapse_shape = memref.collapse_shape %arg0 [[0, 1]] : memref<4x1000xf16> into memref<4000xf16> + %collapse_shape_1 = memref.collapse_shape %arg1 [[0, 1]] : memref<4x1000xf32> into memref<4000xf32> + %expand_shape = memref.expand_shape %collapse_shape [[0, 1]] : memref<4000xf16> into memref<32x125xf16> + %expand_shape_2 = memref.expand_shape %collapse_shape_1 [[0, 1]] : memref<4000xf32> into memref<32x125xf32> + %alloc_3 = memref.alloc() : memref<32xf32> + gpu.launch_func @unified::@Unknown147_kernel blocks in (%c32, %c1, %c1) threads in (%c128, %c1, %c1) args(%expand_shape : memref<32x125xf16>, %expand_shape_2 : memref<32x125xf32>, %alloc_3 : memref<32xf32>) + gpu.launch_func @unified::@Unknown147_kernel_0 blocks in (%c1, %c1, %c1) threads in (%c32, %c1, %c1) args(%alloc_3 : memref<32xf32>, %alloc : memref<f32>) + return %alloc : memref<f32> + } + func.func private @Unknown148(%arg0: memref<f32>) -> memref<f32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [0 : i32, 0 : i32], __byre__kernel_name = "Unknown148", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<f32> - gpu.launch_func @unified::@Unknown141 blocks in (%c1, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<f32>, %alloc : memref<f32>) + gpu.launch_func @unified::@Unknown148 blocks in (%c1, %c1, %c1)
threads in (%c256, %c1, %c1) args(%arg0 : memref<f32>, %alloc : memref<f32>) return %alloc : memref<f32> } - func.func private @Unknown142(%arg0: memref<64x3x7x7xf16>) -> memref<64x3x7x7xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 74 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown142", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown149(%arg0: memref<64x3x7x7xf16>) -> memref<64x3x7x7xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 10 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown149", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c10 = arith.constant 10 : index %c1 = arith.constant 1 : index - %c74 = arith.constant 74 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<64x3x7x7xf32> - gpu.launch_func @unified::@Unknown142 blocks in (%c74, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64x3x7x7xf16>, %alloc : memref<64x3x7x7xf32>) + gpu.launch_func @unified::@Unknown149 blocks in (%c10, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<64x3x7x7xf16>, %alloc : memref<64x3x7x7xf32>) return %alloc : memref<64x3x7x7xf32> } - func.func private @Unknown143(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown143", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c288 = arith.constant 288 : index - %alloc = memref.alloc() : memref<64x64x3x3xf32> - gpu.launch_func @unified::@Unknown143 blocks in (%c288, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64x64x3x3xf16>, %alloc : memref<64x64x3x3xf32>) - return %alloc : memref<64x64x3x3xf32> - } - func.func private @Unknown144(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown144", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c288 = arith.constant 288 : index - %alloc = memref.alloc() : memref<64x64x3x3xf32> - gpu.launch_func @unified::@Unknown144 blocks in (%c288, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64x64x3x3xf16>, %alloc : memref<64x64x3x3xf32>) - return %alloc : memref<64x64x3x3xf32> - } - func.func private @Unknown145(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown145", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c288 = arith.constant 288 : index - %alloc = memref.alloc() : memref<64x64x3x3xf32> - gpu.launch_func @unified::@Unknown145 blocks in (%c288, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64x64x3x3xf16>, %alloc : memref<64x64x3x3xf32>) - return %alloc : 
memref<64x64x3x3xf32> - } - func.func private @Unknown146(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown146", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown150(%arg0: memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 36 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown150", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c36 = arith.constant 36 : index %c1 = arith.constant 1 : index - %c288 = arith.constant 288 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<64x64x3x3xf32> - gpu.launch_func @unified::@Unknown146 blocks in (%c288, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<64x64x3x3xf16>, %alloc : memref<64x64x3x3xf32>) + gpu.launch_func @unified::@Unknown150 blocks in (%c36, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<64x64x3x3xf16>, %alloc : memref<64x64x3x3xf32>) return %alloc : memref<64x64x3x3xf32> } - func.func private @Unknown147(%arg0: memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 576 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown147", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown154(%arg0: memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 72 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown154", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c72 = arith.constant 72 : index %c1 = arith.constant 1 : index - %c576 = arith.constant 576 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<128x64x3x3xf32> - gpu.launch_func @unified::@Unknown147 blocks in (%c576, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128x64x3x3xf16>, %alloc : memref<128x64x3x3xf32>) + gpu.launch_func @unified::@Unknown154 blocks in (%c72, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<128x64x3x3xf16>, %alloc : memref<128x64x3x3xf32>) return %alloc : memref<128x64x3x3xf32> } - func.func private @Unknown148(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown148", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown155(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 144 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown155", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c144 = arith.constant 144 : index %c1 = arith.constant 1 : index - %c1152 = arith.constant 1152 : index + %c256 = 
arith.constant 256 : index %alloc = memref.alloc() : memref<128x128x3x3xf32> - gpu.launch_func @unified::@Unknown148 blocks in (%c1152, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128x128x3x3xf16>, %alloc : memref<128x128x3x3xf32>) + gpu.launch_func @unified::@Unknown155 blocks in (%c144, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<128x128x3x3xf16>, %alloc : memref<128x128x3x3xf32>) return %alloc : memref<128x128x3x3xf32> } - func.func private @Unknown149(%arg0: memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 64 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown149", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown156(%arg0: memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 8 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown156", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c8 = arith.constant 8 : index %c1 = arith.constant 1 : index - %c64 = arith.constant 64 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<128x64x1x1xf32> - gpu.launch_func @unified::@Unknown149 blocks in (%c64, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128x64x1x1xf16>, %alloc : memref<128x64x1x1xf32>) + gpu.launch_func @unified::@Unknown156 blocks in (%c8, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<128x64x1x1xf16>, %alloc : memref<128x64x1x1xf32>) return %alloc : memref<128x64x1x1xf32> } - func.func private @Unknown150(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown150", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c1152 = arith.constant 1152 : index - %alloc = memref.alloc() : memref<128x128x3x3xf32> - gpu.launch_func @unified::@Unknown150 blocks in (%c1152, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128x128x3x3xf16>, %alloc : memref<128x128x3x3xf32>) - return %alloc : memref<128x128x3x3xf32> - } - func.func private @Unknown151(%arg0: memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown151", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c1152 = arith.constant 1152 : index - %alloc = memref.alloc() : memref<128x128x3x3xf32> - gpu.launch_func @unified::@Unknown151 blocks in (%c1152, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<128x128x3x3xf16>, %alloc : memref<128x128x3x3xf32>) - return %alloc : memref<128x128x3x3xf32> - } - func.func private @Unknown152(%arg0: memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2304 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown152", __byteir_elementwise_fusion__, arg_offsets = [0 
: i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown159(%arg0: memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown159", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c288 = arith.constant 288 : index %c1 = arith.constant 1 : index - %c2304 = arith.constant 2304 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<256x128x3x3xf32> - gpu.launch_func @unified::@Unknown152 blocks in (%c2304, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256x128x3x3xf16>, %alloc : memref<256x128x3x3xf32>) + gpu.launch_func @unified::@Unknown159 blocks in (%c288, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<256x128x3x3xf16>, %alloc : memref<256x128x3x3xf32>) return %alloc : memref<256x128x3x3xf32> } - func.func private @Unknown153(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4608 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown153", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown160(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 576 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown160", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c576 = arith.constant 576 : index %c1 = arith.constant 1 : index - %c4608 = arith.constant 4608 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<256x256x3x3xf32> - gpu.launch_func @unified::@Unknown153 blocks in (%c4608, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256x256x3x3xf16>, %alloc : memref<256x256x3x3xf32>) + gpu.launch_func @unified::@Unknown160 blocks in (%c576, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<256x256x3x3xf16>, %alloc : memref<256x256x3x3xf32>) return %alloc : memref<256x256x3x3xf32> } - func.func private @Unknown154(%arg0: memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 256 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown154", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown161(%arg0: memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 32 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown161", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c32 = arith.constant 32 : index %c1 = arith.constant 1 : index %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<256x128x1x1xf32> - gpu.launch_func @unified::@Unknown154 blocks in (%c256, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256x128x1x1xf16>, %alloc : memref<256x128x1x1xf32>) + gpu.launch_func @unified::@Unknown161 blocks in (%c32, %c1, %c1) threads 
in (%c256, %c1, %c1) args(%arg0 : memref<256x128x1x1xf16>, %alloc : memref<256x128x1x1xf32>) return %alloc : memref<256x128x1x1xf32> } - func.func private @Unknown155(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4608 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown155", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c4608 = arith.constant 4608 : index - %alloc = memref.alloc() : memref<256x256x3x3xf32> - gpu.launch_func @unified::@Unknown155 blocks in (%c4608, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256x256x3x3xf16>, %alloc : memref<256x256x3x3xf32>) - return %alloc : memref<256x256x3x3xf32> - } - func.func private @Unknown156(%arg0: memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4608 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown156", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c4608 = arith.constant 4608 : index - %alloc = memref.alloc() : memref<256x256x3x3xf32> - gpu.launch_func @unified::@Unknown156 blocks in (%c4608, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<256x256x3x3xf16>, %alloc : memref<256x256x3x3xf32>) - return %alloc : memref<256x256x3x3xf32> - } - func.func private @Unknown157(%arg0: memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 9216 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown157", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown164(%arg0: memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown164", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c1152 = arith.constant 1152 : index %c1 = arith.constant 1 : index - %c9216 = arith.constant 9216 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<512x256x3x3xf32> - gpu.launch_func @unified::@Unknown157 blocks in (%c9216, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512x256x3x3xf16>, %alloc : memref<512x256x3x3xf32>) + gpu.launch_func @unified::@Unknown164 blocks in (%c1152, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<512x256x3x3xf16>, %alloc : memref<512x256x3x3xf32>) return %alloc : memref<512x256x3x3xf32> } - func.func private @Unknown158(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 18432 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown158", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown165(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 2304 : i32, 
__byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown165", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c2304 = arith.constant 2304 : index %c1 = arith.constant 1 : index - %c18432 = arith.constant 18432 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<512x512x3x3xf32> - gpu.launch_func @unified::@Unknown158 blocks in (%c18432, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512x512x3x3xf16>, %alloc : memref<512x512x3x3xf32>) + gpu.launch_func @unified::@Unknown165 blocks in (%c2304, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<512x512x3x3xf16>, %alloc : memref<512x512x3x3xf32>) return %alloc : memref<512x512x3x3xf32> } - func.func private @Unknown159(%arg0: memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1024 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown159", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + func.func private @Unknown166(%arg0: memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 128 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown166", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { %c128 = arith.constant 128 : index %c1 = arith.constant 1 : index - %c1024 = arith.constant 1024 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<512x256x1x1xf32> - gpu.launch_func @unified::@Unknown159 blocks in (%c1024, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512x256x1x1xf16>, %alloc : memref<512x256x1x1xf32>) + gpu.launch_func @unified::@Unknown166 blocks in (%c128, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<512x256x1x1xf16>, %alloc : memref<512x256x1x1xf32>) return %alloc : memref<512x256x1x1xf32> } - func.func private @Unknown160(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 18432 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown160", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c18432 = arith.constant 18432 : index - %alloc = memref.alloc() : memref<512x512x3x3xf32> - gpu.launch_func @unified::@Unknown160 blocks in (%c18432, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512x512x3x3xf16>, %alloc : memref<512x512x3x3xf32>) - return %alloc : memref<512x512x3x3xf32> - } - func.func private @Unknown161(%arg0: memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 18432 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown161", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c18432 = arith.constant 18432 : index - %alloc = memref.alloc() : memref<512x512x3x3xf32> - gpu.launch_func @unified::@Unknown161 blocks in (%c18432, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<512x512x3x3xf16>, %alloc : memref<512x512x3x3xf32>) - return %alloc : 
memref<512x512x3x3xf32> - } - func.func private @Unknown163(%arg0: memref<1000x512xf16>) -> memref<1000x512xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4000 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown163", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown170(%arg0: memref<1000x512xf16>) -> memref<1000x512xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 500 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown170", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c500 = arith.constant 500 : index %c1 = arith.constant 1 : index - %c4000 = arith.constant 4000 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<1000x512xf32> - gpu.launch_func @unified::@Unknown163 blocks in (%c4000, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1000x512xf16>, %alloc : memref<1000x512xf32>) + gpu.launch_func @unified::@Unknown170 blocks in (%c500, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1000x512xf16>, %alloc : memref<1000x512xf32>) return %alloc : memref<1000x512xf32> } - func.func private @Unknown164(%arg0: memref<1000xf32>) -> memref<1000xf32> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 8 : i32, __byre__arg_ranks = [1 : i32, 1 : i32], __byre__kernel_name = "Unknown164", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { - %c128 = arith.constant 128 : index + func.func private @Unknown171(%arg0: memref<4x1000xf16>) -> memref<1000xf32> attributes {__byteir_reduction_fusion__} { + %c2 = arith.constant 2 : index + %c32 = arith.constant 32 : index + %c1000 = arith.constant 1000 : index + %c-32 = arith.constant -32 : index + %cst = arith.constant 0.000000e+00 : f32 + %cst_0 = arith.constant 0.000000e+00 : f16 %c1 = arith.constant 1 : index - %c8 = arith.constant 8 : index + %c0 = arith.constant 0 : index + %alloc = memref.alloc() : memref<1000xf32> + gpu.launch_func @unified::@Unknown171_kernel blocks in (%c32, %c1, %c1) threads in (%c32, %c2, %c1) args(%arg0 : memref<4x1000xf16>, %alloc : memref<1000xf32>) + return %alloc : memref<1000xf32> + } + func.func private @Unknown172(%arg0: memref<1000xf32>) -> memref<1000xf32> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32], __byre__kernel_name = "Unknown172", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name} { + %c1 = arith.constant 1 : index + %c256 = arith.constant 256 : index %alloc = memref.alloc() : memref<1000xf32> - gpu.launch_func @unified::@Unknown164 blocks in (%c8, %c1, %c1) threads in (%c128, %c1, %c1) args(%arg0 : memref<1000xf32>, %alloc : memref<1000xf32>) + gpu.launch_func @unified::@Unknown172 blocks in (%c1, %c1, %c1) threads in (%c256, %c1, %c1) args(%arg0 : memref<1000xf32>, %alloc : memref<1000xf32>) return %alloc : memref<1000xf32> } func.func @main(%arg0: memref<4x3x224x224xf32>, %arg1: memref<4x1000xf32>, %arg2: memref<64x3x7x7xf32>, %arg3: memref<64xf32>, %arg4: memref<64xf32>, %arg5: memref<64xf32>, %arg6: memref<64xf32>, %arg7: memref<64x64x3x3xf32>, %arg8: memref<64xf32>, %arg9: memref<64xf32>, %arg10: memref<64xf32>, %arg11: memref<64xf32>, %arg12: 
memref<64x64x3x3xf32>, %arg13: memref<64xf32>, %arg14: memref<64xf32>, %arg15: memref<64xf32>, %arg16: memref<64xf32>, %arg17: memref<64x64x3x3xf32>, %arg18: memref<64xf32>, %arg19: memref<64xf32>, %arg20: memref<64xf32>, %arg21: memref<64xf32>, %arg22: memref<64x64x3x3xf32>, %arg23: memref<64xf32>, %arg24: memref<64xf32>, %arg25: memref<64xf32>, %arg26: memref<64xf32>, %arg27: memref<128x64x3x3xf32>, %arg28: memref<128xf32>, %arg29: memref<128xf32>, %arg30: memref<128xf32>, %arg31: memref<128xf32>, %arg32: memref<128x128x3x3xf32>, %arg33: memref<128xf32>, %arg34: memref<128xf32>, %arg35: memref<128xf32>, %arg36: memref<128xf32>, %arg37: memref<128x64x1x1xf32>, %arg38: memref<128xf32>, %arg39: memref<128xf32>, %arg40: memref<128xf32>, %arg41: memref<128xf32>, %arg42: memref<128x128x3x3xf32>, %arg43: memref<128xf32>, %arg44: memref<128xf32>, %arg45: memref<128xf32>, %arg46: memref<128xf32>, %arg47: memref<128x128x3x3xf32>, %arg48: memref<128xf32>, %arg49: memref<128xf32>, %arg50: memref<128xf32>, %arg51: memref<128xf32>, %arg52: memref<256x128x3x3xf32>, %arg53: memref<256xf32>, %arg54: memref<256xf32>, %arg55: memref<256xf32>, %arg56: memref<256xf32>, %arg57: memref<256x256x3x3xf32>, %arg58: memref<256xf32>, %arg59: memref<256xf32>, %arg60: memref<256xf32>, %arg61: memref<256xf32>, %arg62: memref<256x128x1x1xf32>, %arg63: memref<256xf32>, %arg64: memref<256xf32>, %arg65: memref<256xf32>, %arg66: memref<256xf32>, %arg67: memref<256x256x3x3xf32>, %arg68: memref<256xf32>, %arg69: memref<256xf32>, %arg70: memref<256xf32>, %arg71: memref<256xf32>, %arg72: memref<256x256x3x3xf32>, %arg73: memref<256xf32>, %arg74: memref<256xf32>, %arg75: memref<256xf32>, %arg76: memref<256xf32>, %arg77: memref<512x256x3x3xf32>, %arg78: memref<512xf32>, %arg79: memref<512xf32>, %arg80: memref<512xf32>, %arg81: memref<512xf32>, %arg82: memref<512x512x3x3xf32>, %arg83: memref<512xf32>, %arg84: memref<512xf32>, %arg85: memref<512xf32>, %arg86: memref<512xf32>, %arg87: memref<512x256x1x1xf32>, %arg88: memref<512xf32>, %arg89: memref<512xf32>, %arg90: memref<512xf32>, %arg91: memref<512xf32>, %arg92: memref<512x512x3x3xf32>, %arg93: memref<512xf32>, %arg94: memref<512xf32>, %arg95: memref<512xf32>, %arg96: memref<512xf32>, %arg97: memref<512x512x3x3xf32>, %arg98: memref<512xf32>, %arg99: memref<512xf32>, %arg100: memref<512xf32>, %arg101: memref<512xf32>, %arg102: memref<1000x512xf32>, %arg103: memref<1000xf32>) -> (memref, memref<64x3x7x7xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<128x64x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x64x1x1xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<256x128x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x128x1x1xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<512x256x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x256x1x1xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, 
memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<1000x512xf32>, memref<1000xf32>) attributes {__placeholder__byre.entry_point} { @@ -4678,344 +2793,340 @@ module @IrToMhlo.2452 attributes {gpu.container_module} { %alloc_0 = memref.alloc() : memref<4x64x112x112xf16> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc, %arg3, %arg4, %alloc_0) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x112x112xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x112x112xf16> %2 = call @Unknown3(%arg7) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> - %3 = call @Unknown4(%arg12) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> - %4 = call @Unknown5(%arg17) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> - %5 = call @Unknown6(%arg22) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> + %3 = call @Unknown3(%arg12) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> + %4 = call @Unknown3(%arg17) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> + %5 = call @Unknown3(%arg22) : (memref<64x64x3x3xf32>) -> memref<64x64x3x3xf16> %6 = call @Unknown7(%arg37) : (memref<128x64x1x1xf32>) -> memref<128x64x1x1xf16> %7 = call @Unknown8(%arg27) : (memref<128x64x3x3xf32>) -> memref<128x64x3x3xf16> %8 = call @Unknown9(%arg32) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> - %9 = call @Unknown10(%arg42) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> - %10 = call @Unknown11(%arg47) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> + %9 = call @Unknown9(%arg42) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> + %10 = call @Unknown9(%arg47) : (memref<128x128x3x3xf32>) -> memref<128x128x3x3xf16> %11 = call @Unknown12(%arg62) : (memref<256x128x1x1xf32>) -> memref<256x128x1x1xf16> %12 = call @Unknown13(%arg52) : (memref<256x128x3x3xf32>) -> memref<256x128x3x3xf16> %13 = call @Unknown14(%arg57) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> - %14 = call @Unknown15(%arg67) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> - %15 = call @Unknown16(%arg72) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> + %14 = call @Unknown14(%arg67) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> + %15 = call @Unknown14(%arg72) : (memref<256x256x3x3xf32>) -> memref<256x256x3x3xf16> %16 = call @Unknown17(%arg87) : (memref<512x256x1x1xf32>) -> memref<512x256x1x1xf16> %17 = call @Unknown18(%arg77) : (memref<512x256x3x3xf32>) -> memref<512x256x3x3xf16> %18 = call @Unknown19(%arg82) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> - %19 = call @Unknown20(%arg92) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> - %20 = call @Unknown21(%arg97) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> + %19 = call @Unknown19(%arg92) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> + %20 = call @Unknown19(%arg97) : (memref<512x512x3x3xf32>) -> memref<512x512x3x3xf16> %21 = call @Unknown22(%arg1) : (memref<4x1000xf32>) -> memref<4x1000xf16> %22 = call @Unknown23(%arg102) : (memref<1000x512xf32>) -> memref<1000x512xf16> - %alloc_1 = memref.alloc() : memref<4xf16> - byre.compute @ReduceSumOp_f16_f16(%21, %alloc_1) {dimensions = dense<1> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf16>, memref<4xf16> - %23:2 = call @Unknown24(%alloc_0) : (memref<4x64x112x112xf16>) -> (memref<4x64x112x112xf16>, memref<4x64x112x112xi1>) + %23 = call @Unknown24(%arg103) : (memref<1000xf32>) -> memref<1000xf16> + %24 = call @Unknown25(%21) : (memref<4x1000xf16>) -> memref<4xf16> + %25:2 = 
call @Unknown26(%alloc_0) : (memref<4x64x112x112xf16>) -> (memref<4x64x112x112xf16>, memref<4x64x112x112xi1>) + %alloc_1 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @PoolMaxOp_f16_f16(%25#0, %alloc_1) {base_dilations = dense<1> : tensor<4xi64>, memory_effects = [1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = dense<1> : tensor<4xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16>, memref<4x64x56x56xf16> %alloc_2 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @PoolMaxOp_f16_f16(%23#0, %alloc_2) {base_dilations = dense<1> : tensor<4xi64>, memory_effects = [1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = dense<1> : tensor<4xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16>, memref<4x64x56x56xf16> + byre.compute @ConvOp_f16f16_f16(%alloc_1, %2, %alloc_2) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> %alloc_3 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvOp_f16f16_f16(%alloc_2, %2, %alloc_3) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_2, %arg8, %arg9, %alloc_3) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> + %26:2 = call @Unknown28(%alloc_3) : (memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) %alloc_4 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_3, %arg8, %arg9, %alloc_4) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> - %24:2 = call @Unknown26(%alloc_4) : (memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) + byre.compute @ConvOp_f16f16_f16(%26#0, %3, %alloc_4) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> %alloc_5 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvOp_f16f16_f16(%24#0, %3, %alloc_5) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = 
dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_4, %arg13, %arg14, %alloc_5) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> + %27:2 = call @Unknown30(%alloc_5, %alloc_1) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) %alloc_6 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_5, %arg13, %arg14, %alloc_6) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> - %25:2 = call @Unknown28(%alloc_6, %alloc_2) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) + byre.compute @ConvOp_f16f16_f16(%27#0, %4, %alloc_6) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> %alloc_7 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvOp_f16f16_f16(%25#0, %4, %alloc_7) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_6, %arg18, %arg19, %alloc_7) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> + %28:2 = call @Unknown28(%alloc_7) : (memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) %alloc_8 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_7, %arg18, %arg19, %alloc_8) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> - %26:2 = call @Unknown30(%alloc_8) : (memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) + byre.compute @ConvOp_f16f16_f16(%28#0, %5, %alloc_8) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> %alloc_9 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute 
@ConvOp_f16f16_f16(%26#0, %5, %alloc_9) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> - %alloc_10 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_9, %arg23, %arg24, %alloc_10) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> - %27:2 = call @Unknown32(%alloc_10, %25#0) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_8, %arg23, %arg24, %alloc_9) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32>, memref<4x64x56x56xf16> + %29:2 = call @Unknown30(%alloc_9, %27#0) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) -> (memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) + %alloc_10 = memref.alloc() : memref<4x128x28x28xf16> + byre.compute @ConvOp_f16f16_f16(%29#0, %6, %alloc_10) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<128x64x1x1xf16>, memref<4x128x28x28xf16> %alloc_11 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvOp_f16f16_f16(%27#0, %6, %alloc_11) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<128x64x1x1xf16>, memref<4x128x28x28xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_10, %arg38, %arg39, %alloc_11) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> %alloc_12 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_11, %arg38, %arg39, %alloc_12) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> + byre.compute @ConvOp_f16f16_f16(%29#0, %7, %alloc_12) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<128x64x3x3xf16>, memref<4x128x28x28xf16> %alloc_13 = memref.alloc() : memref<4x128x28x28xf16> 
- byre.compute @ConvOp_f16f16_f16(%27#0, %7, %alloc_13) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<128x64x3x3xf16>, memref<4x128x28x28xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_12, %arg28, %arg29, %alloc_13) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> + %30:2 = call @Unknown37(%alloc_13) : (memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) %alloc_14 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_13, %arg28, %arg29, %alloc_14) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> - %28:2 = call @Unknown35(%alloc_14) : (memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) + byre.compute @ConvOp_f16f16_f16(%30#0, %8, %alloc_14) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> %alloc_15 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvOp_f16f16_f16(%28#0, %8, %alloc_15) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_14, %arg33, %arg34, %alloc_15) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> + %31:2 = call @Unknown39(%alloc_15, %alloc_11) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) %alloc_16 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_15, %arg33, %arg34, %alloc_16) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> - %29:2 = call @Unknown37(%alloc_16, %alloc_12) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) + byre.compute @ConvOp_f16f16_f16(%31#0, %9, %alloc_16) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> 
: tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> %alloc_17 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvOp_f16f16_f16(%29#0, %9, %alloc_17) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_16, %arg43, %arg44, %alloc_17) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> + %32:2 = call @Unknown37(%alloc_17) : (memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) %alloc_18 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_17, %arg43, %arg44, %alloc_18) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> - %30:2 = call @Unknown39(%alloc_18) : (memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) + byre.compute @ConvOp_f16f16_f16(%32#0, %10, %alloc_18) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> %alloc_19 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvOp_f16f16_f16(%30#0, %10, %alloc_19) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> - %alloc_20 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_19, %arg48, %arg49, %alloc_20) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> - %31:2 = call @Unknown41(%alloc_20, %29#0) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_18, %arg48, %arg49, %alloc_19) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32>, memref<4x128x28x28xf16> + %33:2 = call @Unknown39(%alloc_19, %31#0) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>) -> (memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) + %alloc_20 = memref.alloc() : memref<4x256x14x14xf16> + 
byre.compute @ConvOp_f16f16_f16(%33#0, %11, %alloc_20) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<256x128x1x1xf16>, memref<4x256x14x14xf16> %alloc_21 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvOp_f16f16_f16(%31#0, %11, %alloc_21) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<256x128x1x1xf16>, memref<4x256x14x14xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_20, %arg63, %arg64, %alloc_21) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> %alloc_22 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_21, %arg63, %arg64, %alloc_22) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> + byre.compute @ConvOp_f16f16_f16(%33#0, %12, %alloc_22) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<256x128x3x3xf16>, memref<4x256x14x14xf16> %alloc_23 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvOp_f16f16_f16(%31#0, %12, %alloc_23) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<256x128x3x3xf16>, memref<4x256x14x14xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_22, %arg53, %arg54, %alloc_23) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> + %34:2 = call @Unknown46(%alloc_23) : (memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) %alloc_24 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_23, %arg53, %arg54, %alloc_24) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> - %32:2 = call @Unknown44(%alloc_24) : (memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) + byre.compute @ConvOp_f16f16_f16(%34#0, %13, %alloc_24) 
{batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> %alloc_25 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvOp_f16f16_f16(%32#0, %13, %alloc_25) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_24, %arg58, %arg59, %alloc_25) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> + %35:2 = call @Unknown48(%alloc_25, %alloc_21) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) %alloc_26 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_25, %arg58, %arg59, %alloc_26) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> - %33:2 = call @Unknown46(%alloc_26, %alloc_22) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) + byre.compute @ConvOp_f16f16_f16(%35#0, %14, %alloc_26) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> %alloc_27 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvOp_f16f16_f16(%33#0, %14, %alloc_27) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_26, %arg68, %arg69, %alloc_27) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> + %36:2 = call @Unknown46(%alloc_27) : (memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) %alloc_28 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_27, %arg68, %arg69, %alloc_28) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : 
memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> - %34:2 = call @Unknown48(%alloc_28) : (memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) + byre.compute @ConvOp_f16f16_f16(%36#0, %15, %alloc_28) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> %alloc_29 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvOp_f16f16_f16(%34#0, %15, %alloc_29) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> - %alloc_30 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_29, %arg73, %arg74, %alloc_30) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> - %35:2 = call @Unknown50(%alloc_30, %33#0) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_28, %arg73, %arg74, %alloc_29) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32>, memref<4x256x14x14xf16> + %37:2 = call @Unknown48(%alloc_29, %35#0) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>) -> (memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) + %alloc_30 = memref.alloc() : memref<4x512x7x7xf16> + byre.compute @ConvOp_f16f16_f16(%37#0, %16, %alloc_30) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<512x256x1x1xf16>, memref<4x512x7x7xf16> %alloc_31 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvOp_f16f16_f16(%35#0, %16, %alloc_31) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<512x256x1x1xf16>, memref<4x512x7x7xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_30, %arg88, %arg89, %alloc_31) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> %alloc_32 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute 
@BatchNormTrainingOp_f16f32f32_f16(%alloc_31, %arg88, %arg89, %alloc_32) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> + byre.compute @ConvOp_f16f16_f16(%37#0, %17, %alloc_32) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<512x256x3x3xf16>, memref<4x512x7x7xf16> %alloc_33 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvOp_f16f16_f16(%35#0, %17, %alloc_33) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<512x256x3x3xf16>, memref<4x512x7x7xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_32, %arg78, %arg79, %alloc_33) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> + %38:2 = call @Unknown55(%alloc_33) : (memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) %alloc_34 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_33, %arg78, %arg79, %alloc_34) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> - %36:2 = call @Unknown53(%alloc_34) : (memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) + byre.compute @ConvOp_f16f16_f16(%38#0, %18, %alloc_34) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> %alloc_35 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvOp_f16f16_f16(%36#0, %18, %alloc_35) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_34, %arg83, %arg84, %alloc_35) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> + %39:2 = call @Unknown57(%alloc_35, %alloc_31) : (memref<4x512x7x7xf16>, memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) %alloc_36 = memref.alloc() : 
memref<4x512x7x7xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_35, %arg83, %arg84, %alloc_36) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> - %37:2 = call @Unknown55(%alloc_36, %alloc_32) : (memref<4x512x7x7xf16>, memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) + byre.compute @ConvOp_f16f16_f16(%39#0, %19, %alloc_36) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> %alloc_37 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvOp_f16f16_f16(%37#0, %19, %alloc_37) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_36, %arg93, %arg94, %alloc_37) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> + %40:2 = call @Unknown55(%alloc_37) : (memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) %alloc_38 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_37, %arg93, %arg94, %alloc_38) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> - %38:2 = call @Unknown57(%alloc_38) : (memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) + byre.compute @ConvOp_f16f16_f16(%40#0, %20, %alloc_38) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> %alloc_39 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvOp_f16f16_f16(%38#0, %20, %alloc_39) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> - %alloc_40 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_39, %arg98, %arg99, %alloc_40) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, 
memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> - %39:2 = call @Unknown59(%alloc_40, %37#0) : (memref<4x512x7x7xf16>, memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_38, %arg98, %arg99, %alloc_39) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32>, memref<4x512x7x7xf16> + %41:2 = call @Unknown57(%alloc_39, %39#0) : (memref<4x512x7x7xf16>, memref<4x512x7x7xf16>) -> (memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) + %42 = call @Unknown62(%41#0) : (memref<4x512x7x7xf16>) -> memref<4x512xf16> + %43 = call @Unknown63(%42) : (memref<4x512xf16>) -> memref<4x512xf16> + %alloc_40 = memref.alloc() : memref<4x1000xf16> + byre.compute @MatmulOp_f16f16_f16(%43, %22, %alloc_40) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<4x512xf16>, memref<1000x512xf16>, memref<4x1000xf16> + %44 = call @Unknown64(%23, %alloc_40) : (memref<1000xf16>, memref<4x1000xf16>) -> memref<4x1000xf16> + %45 = call @Unknown65(%44) : (memref<4x1000xf16>) -> memref<4xf16> + %46 = call @Unknown66(%45, %44) : (memref<4xf16>, memref<4x1000xf16>) -> memref<4x1000xf16> + %47 = call @Unknown67(%46) : (memref<4x1000xf16>) -> memref<4xf16> + %48 = call @Unknown68(%47) : (memref<4xf16>) -> memref<4xf16> + %49:2 = call @Unknown69(%48, %46, %24, %21) : (memref<4xf16>, memref<4x1000xf16>, memref<4xf16>, memref<4x1000xf16>) -> (memref<4x1000xf16>, memref<4x1000xf16>) %alloc_41 = memref.alloc() : memref<4x512xf16> - byre.compute @ReduceSumOp_f16_f16(%39#0, %alloc_41) {dimensions = dense<[3, 2]> : tensor<2xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<4x512xf16> - %40 = call @Unknown60(%alloc_41) : (memref<4x512xf16>) -> memref<4x512xf16> - %alloc_42 = memref.alloc() : memref<4x1000xf16> - byre.compute @MatmulOp_f16f16_f16(%40, %22, %alloc_42) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<4x512xf16>, memref<1000x512xf16>, memref<4x1000xf16> - %41 = call @Unknown61(%arg103, %alloc_42) : (memref<1000xf32>, memref<4x1000xf16>) -> memref<4x1000xf16> - %alloc_43 = memref.alloc() : memref<4xf16> - byre.compute @ReduceMaxOp_f16_f16(%41, %alloc_43) {dimensions = dense<1> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf16>, memref<4xf16> - %42:2 = call @Unknown62(%alloc_43, %41) : (memref<4xf16>, memref<4x1000xf16>) -> (memref<4x1000xf16>, memref<4x1000xf16>) - %alloc_44 = memref.alloc() : memref<4xf16> - byre.compute @ReduceSumOp_f16_f16(%42#1, %alloc_44) {dimensions = dense<1> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf16>, memref<4xf16> - %43:3 = call @Unknown63(%alloc_44, %42#0, %alloc_1, %21, %arg1) : (memref<4xf16>, memref<4x1000xf16>, memref<4xf16>, memref<4x1000xf16>, memref<4x1000xf32>) -> (memref<4x1000xf16>, memref<4x1000xf32>, memref<4x1000xf32>) - %alloc_45 = memref.alloc() : memref<4x512xf16> - byre.compute @MatmulOp_f16f16_f16(%43#0, %22, %alloc_45) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<4x1000xf16>, memref<1000x512xf16>, memref<4x512xf16> - %44 = call @Unknown64(%alloc_45, %39#1) : (memref<4x512xf16>, memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> - %alloc_46 = memref.alloc() : memref<4x512x7x7xf16> - 
%alloc_47 = memref.alloc() : memref<512xf32> + byre.compute @MatmulOp_f16f16_f16(%49#1, %22, %alloc_41) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<4x1000xf16>, memref<1000x512xf16>, memref<4x512xf16> + %50 = call @Unknown70(%alloc_41, %41#1) : (memref<4x512xf16>, memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> + %alloc_42 = memref.alloc() : memref<4x512x7x7xf16> + %alloc_43 = memref.alloc() : memref<512xf32> + %alloc_44 = memref.alloc() : memref<512xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_38, %arg98, %50, %alloc_42, %alloc_43, %alloc_44) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> + %alloc_45 = memref.alloc() : memref<4x512x7x7xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_42, %20, %alloc_45) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> + %alloc_46 = memref.alloc() : memref<512x512x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%40#0, %alloc_42, %alloc_46) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512x512x3x3xf16> + %51 = call @Unknown74(%40#1, %alloc_45) : (memref<4x512x7x7xi1>, memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> + %alloc_47 = memref.alloc() : memref<4x512x7x7xf16> %alloc_48 = memref.alloc() : memref<512xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_39, %arg98, %44, %alloc_46, %alloc_47, %alloc_48) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %alloc_49 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_46, %20, %alloc_49) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> - %alloc_50 = memref.alloc() : memref<512x512x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%38#0, %alloc_46, %alloc_50) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512x512x3x3xf16> - %45 = call @Unknown68(%38#1, %alloc_49) : (memref<4x512x7x7xi1>, memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> - %alloc_51 = memref.alloc() : memref<4x512x7x7xf16> - %alloc_52 = memref.alloc() : memref<512xf32> + %alloc_49 = memref.alloc() 
: memref<512xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_36, %arg93, %51, %alloc_47, %alloc_48, %alloc_49) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> + %alloc_50 = memref.alloc() : memref<4x512x7x7xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_47, %19, %alloc_50) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> + %alloc_51 = memref.alloc() : memref<512x512x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%39#0, %alloc_47, %alloc_51) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512x512x3x3xf16> + %52 = call @Unknown78(%50, %alloc_50, %39#1) : (memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> + %alloc_52 = memref.alloc() : memref<4x512x7x7xf16> %alloc_53 = memref.alloc() : memref<512xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_37, %arg93, %45, %alloc_51, %alloc_52, %alloc_53) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %alloc_54 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_51, %19, %alloc_54) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> - %alloc_55 = memref.alloc() : memref<512x512x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%37#0, %alloc_51, %alloc_55) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512x512x3x3xf16> - %46 = call @Unknown72(%44, %alloc_54, %37#1) : (memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<4x512x7x7xi1>) -> memref<4x512x7x7xf16> - %alloc_56 = memref.alloc() : memref<4x512x7x7xf16> - %alloc_57 = memref.alloc() : memref<512xf32> + %alloc_54 = memref.alloc() : memref<512xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_34, %arg83, %52, %alloc_52, %alloc_53, %alloc_54) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> + %alloc_55 = memref.alloc() : memref<4x512x7x7xf16> + byre.compute 
@ConvBackwardDataOp_f16f16_f16(%alloc_52, %18, %alloc_55) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> + %alloc_56 = memref.alloc() : memref<512x512x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%38#0, %alloc_52, %alloc_56) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512x512x3x3xf16> + %53 = call @Unknown74(%38#1, %alloc_55) : (memref<4x512x7x7xi1>, memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> + %alloc_57 = memref.alloc() : memref<4x512x7x7xf16> %alloc_58 = memref.alloc() : memref<512xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_35, %arg83, %46, %alloc_56, %alloc_57, %alloc_58) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %alloc_59 = memref.alloc() : memref<4x512x7x7xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_56, %18, %alloc_59) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x512x3x3xf16>, memref<4x512x7x7xf16> - %alloc_60 = memref.alloc() : memref<512x512x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%36#0, %alloc_56, %alloc_60) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512x512x3x3xf16> - %47 = call @Unknown76(%36#1, %alloc_59) : (memref<4x512x7x7xi1>, memref<4x512x7x7xf16>) -> memref<4x512x7x7xf16> - %alloc_61 = memref.alloc() : memref<4x512x7x7xf16> - %alloc_62 = memref.alloc() : memref<512xf32> + %alloc_59 = memref.alloc() : memref<512xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_32, %arg78, %53, %alloc_57, %alloc_58, %alloc_59) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> + %alloc_60 = memref.alloc() : memref<4x256x14x14xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_57, %17, %alloc_60) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x256x3x3xf16>, memref<4x256x14x14xf16> + %alloc_61 = memref.alloc() : memref<512x256x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%37#0, %alloc_57, %alloc_61) 
{batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x512x7x7xf16>, memref<512x256x3x3xf16> + %alloc_62 = memref.alloc() : memref<4x512x7x7xf16> %alloc_63 = memref.alloc() : memref<512xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_33, %arg78, %47, %alloc_61, %alloc_62, %alloc_63) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %alloc_64 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_61, %17, %alloc_64) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x256x3x3xf16>, memref<4x256x14x14xf16> - %alloc_65 = memref.alloc() : memref<512x256x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%35#0, %alloc_61, %alloc_65) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x512x7x7xf16>, memref<512x256x3x3xf16> - %alloc_66 = memref.alloc() : memref<4x512x7x7xf16> - %alloc_67 = memref.alloc() : memref<512xf32> - %alloc_68 = memref.alloc() : memref<512xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_31, %arg88, %46, %alloc_66, %alloc_67, %alloc_68) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> - %alloc_69 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_66, %16, %alloc_69) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x256x1x1xf16>, memref<4x256x14x14xf16> - %alloc_70 = memref.alloc() : memref<512x256x1x1xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%35#0, %alloc_66, %alloc_70) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x512x7x7xf16>, memref<512x256x1x1xf16> - %48 = call @Unknown83(%alloc_69, %alloc_64, %35#1) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> - %alloc_71 = memref.alloc() : memref<4x256x14x14xf16> - %alloc_72 = memref.alloc() : memref<256xf32> + %alloc_64 = memref.alloc() : memref<512xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_30, %arg88, %52, %alloc_62, %alloc_63, %alloc_64) 
{epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16>, memref<512xf32>, memref<4x512x7x7xf16>, memref<4x512x7x7xf16>, memref<512xf32>, memref<512xf32> + %alloc_65 = memref.alloc() : memref<4x256x14x14xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_62, %16, %alloc_65) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16>, memref<512x256x1x1xf16>, memref<4x256x14x14xf16> + %alloc_66 = memref.alloc() : memref<512x256x1x1xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%37#0, %alloc_62, %alloc_66) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x512x7x7xf16>, memref<512x256x1x1xf16> + %54 = call @Unknown89(%alloc_65, %alloc_60, %37#1) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> + %alloc_67 = memref.alloc() : memref<4x256x14x14xf16> + %alloc_68 = memref.alloc() : memref<256xf32> + %alloc_69 = memref.alloc() : memref<256xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_28, %arg73, %54, %alloc_67, %alloc_68, %alloc_69) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> + %alloc_70 = memref.alloc() : memref<4x256x14x14xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_67, %15, %alloc_70) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> + %alloc_71 = memref.alloc() : memref<256x256x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%36#0, %alloc_67, %alloc_71) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256x256x3x3xf16> + %55 = call @Unknown93(%36#1, %alloc_70) : (memref<4x256x14x14xi1>, memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> + %alloc_72 = memref.alloc() : memref<4x256x14x14xf16> %alloc_73 = memref.alloc() : memref<256xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_29, %arg73, %48, %alloc_71, %alloc_72, %alloc_73) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %alloc_74 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_71, %15, %alloc_74) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, 
input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> - %alloc_75 = memref.alloc() : memref<256x256x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%34#0, %alloc_71, %alloc_75) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256x256x3x3xf16> - %49 = call @Unknown87(%34#1, %alloc_74) : (memref<4x256x14x14xi1>, memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> - %alloc_76 = memref.alloc() : memref<4x256x14x14xf16> - %alloc_77 = memref.alloc() : memref<256xf32> + %alloc_74 = memref.alloc() : memref<256xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_26, %arg68, %55, %alloc_72, %alloc_73, %alloc_74) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> + %alloc_75 = memref.alloc() : memref<4x256x14x14xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_72, %14, %alloc_75) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> + %alloc_76 = memref.alloc() : memref<256x256x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%35#0, %alloc_72, %alloc_76) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256x256x3x3xf16> + %56 = call @Unknown89(%54, %alloc_75, %35#1) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> + %alloc_77 = memref.alloc() : memref<4x256x14x14xf16> %alloc_78 = memref.alloc() : memref<256xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_27, %arg68, %49, %alloc_76, %alloc_77, %alloc_78) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %alloc_79 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_76, %14, %alloc_79) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> - %alloc_80 = memref.alloc() : memref<256x256x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%33#0, %alloc_76, %alloc_80) {batch_group_count = 1 : i64, 
feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256x256x3x3xf16> - %50 = call @Unknown91(%48, %alloc_79, %33#1) : (memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<4x256x14x14xi1>) -> memref<4x256x14x14xf16> - %alloc_81 = memref.alloc() : memref<4x256x14x14xf16> - %alloc_82 = memref.alloc() : memref<256xf32> + %alloc_79 = memref.alloc() : memref<256xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_24, %arg58, %56, %alloc_77, %alloc_78, %alloc_79) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> + %alloc_80 = memref.alloc() : memref<4x256x14x14xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_77, %13, %alloc_80) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> + %alloc_81 = memref.alloc() : memref<256x256x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%34#0, %alloc_77, %alloc_81) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256x256x3x3xf16> + %57 = call @Unknown93(%34#1, %alloc_80) : (memref<4x256x14x14xi1>, memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> + %alloc_82 = memref.alloc() : memref<4x256x14x14xf16> %alloc_83 = memref.alloc() : memref<256xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_25, %arg58, %50, %alloc_81, %alloc_82, %alloc_83) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %alloc_84 = memref.alloc() : memref<4x256x14x14xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_81, %13, %alloc_84) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x256x3x3xf16>, memref<4x256x14x14xf16> - %alloc_85 = memref.alloc() : memref<256x256x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%32#0, %alloc_81, %alloc_85) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256x256x3x3xf16> - %51 = call @Unknown95(%32#1, %alloc_84) : (memref<4x256x14x14xi1>, memref<4x256x14x14xf16>) -> memref<4x256x14x14xf16> - %alloc_86 = 
memref.alloc() : memref<4x256x14x14xf16> - %alloc_87 = memref.alloc() : memref<256xf32> + %alloc_84 = memref.alloc() : memref<256xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_22, %arg53, %57, %alloc_82, %alloc_83, %alloc_84) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> + %alloc_85 = memref.alloc() : memref<4x128x28x28xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_82, %12, %alloc_85) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x128x3x3xf16>, memref<4x128x28x28xf16> + %alloc_86 = memref.alloc() : memref<256x128x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%33#0, %alloc_82, %alloc_86) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x256x14x14xf16>, memref<256x128x3x3xf16> + %alloc_87 = memref.alloc() : memref<4x256x14x14xf16> %alloc_88 = memref.alloc() : memref<256xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_23, %arg53, %51, %alloc_86, %alloc_87, %alloc_88) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %alloc_89 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_86, %12, %alloc_89) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x128x3x3xf16>, memref<4x128x28x28xf16> - %alloc_90 = memref.alloc() : memref<256x128x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%31#0, %alloc_86, %alloc_90) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x256x14x14xf16>, memref<256x128x3x3xf16> - %alloc_91 = memref.alloc() : memref<4x256x14x14xf16> - %alloc_92 = memref.alloc() : memref<256xf32> - %alloc_93 = memref.alloc() : memref<256xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_21, %arg63, %50, %alloc_91, %alloc_92, %alloc_93) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> - %alloc_94 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_91, %11, %alloc_94) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", 
kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x128x1x1xf16>, memref<4x128x28x28xf16> - %alloc_95 = memref.alloc() : memref<256x128x1x1xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%31#0, %alloc_91, %alloc_95) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x256x14x14xf16>, memref<256x128x1x1xf16> - %52 = call @Unknown102(%alloc_94, %alloc_89, %31#1) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> - %alloc_96 = memref.alloc() : memref<4x128x28x28xf16> - %alloc_97 = memref.alloc() : memref<128xf32> + %alloc_89 = memref.alloc() : memref<256xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_20, %arg63, %56, %alloc_87, %alloc_88, %alloc_89) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16>, memref<256xf32>, memref<4x256x14x14xf16>, memref<4x256x14x14xf16>, memref<256xf32>, memref<256xf32> + %alloc_90 = memref.alloc() : memref<4x128x28x28xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_87, %11, %alloc_90) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16>, memref<256x128x1x1xf16>, memref<4x128x28x28xf16> + %alloc_91 = memref.alloc() : memref<256x128x1x1xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%33#0, %alloc_87, %alloc_91) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x256x14x14xf16>, memref<256x128x1x1xf16> + %58 = call @Unknown108(%alloc_90, %alloc_85, %33#1) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> + %alloc_92 = memref.alloc() : memref<4x128x28x28xf16> + %alloc_93 = memref.alloc() : memref<128xf32> + %alloc_94 = memref.alloc() : memref<128xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_18, %arg48, %58, %alloc_92, %alloc_93, %alloc_94) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> + %alloc_95 = memref.alloc() : memref<4x128x28x28xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_92, %10, %alloc_95) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> + %alloc_96 = memref.alloc() : memref<128x128x3x3xf16> + byre.compute 
@ConvBackwardFilterOp_f16f16_f16(%32#0, %alloc_92, %alloc_96) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128x128x3x3xf16> + %59 = call @Unknown112(%32#1, %alloc_95) : (memref<4x128x28x28xi1>, memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> + %alloc_97 = memref.alloc() : memref<4x128x28x28xf16> %alloc_98 = memref.alloc() : memref<128xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_19, %arg48, %52, %alloc_96, %alloc_97, %alloc_98) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %alloc_99 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_96, %10, %alloc_99) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> - %alloc_100 = memref.alloc() : memref<128x128x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%30#0, %alloc_96, %alloc_100) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128x128x3x3xf16> - %53 = call @Unknown106(%30#1, %alloc_99) : (memref<4x128x28x28xi1>, memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> - %alloc_101 = memref.alloc() : memref<4x128x28x28xf16> - %alloc_102 = memref.alloc() : memref<128xf32> + %alloc_99 = memref.alloc() : memref<128xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_16, %arg43, %59, %alloc_97, %alloc_98, %alloc_99) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> + %alloc_100 = memref.alloc() : memref<4x128x28x28xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_97, %9, %alloc_100) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> + %alloc_101 = memref.alloc() : memref<128x128x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%31#0, %alloc_97, %alloc_101) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128x128x3x3xf16> + %60 = call @Unknown108(%58, %alloc_100, %31#1) : 
(memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> + %alloc_102 = memref.alloc() : memref<4x128x28x28xf16> %alloc_103 = memref.alloc() : memref<128xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_17, %arg43, %53, %alloc_101, %alloc_102, %alloc_103) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %alloc_104 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_101, %9, %alloc_104) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> - %alloc_105 = memref.alloc() : memref<128x128x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%29#0, %alloc_101, %alloc_105) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128x128x3x3xf16> - %54 = call @Unknown110(%52, %alloc_104, %29#1) : (memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<4x128x28x28xi1>) -> memref<4x128x28x28xf16> - %alloc_106 = memref.alloc() : memref<4x128x28x28xf16> - %alloc_107 = memref.alloc() : memref<128xf32> + %alloc_104 = memref.alloc() : memref<128xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_14, %arg33, %60, %alloc_102, %alloc_103, %alloc_104) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> + %alloc_105 = memref.alloc() : memref<4x128x28x28xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_102, %8, %alloc_105) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> + %alloc_106 = memref.alloc() : memref<128x128x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%30#0, %alloc_102, %alloc_106) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128x128x3x3xf16> + %61 = call @Unknown112(%30#1, %alloc_105) : (memref<4x128x28x28xi1>, memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> + %alloc_107 = memref.alloc() : memref<4x128x28x28xf16> %alloc_108 = memref.alloc() : memref<128xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_15, %arg33, %54, %alloc_106, %alloc_107, %alloc_108) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : 
i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %alloc_109 = memref.alloc() : memref<4x128x28x28xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_106, %8, %alloc_109) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x128x3x3xf16>, memref<4x128x28x28xf16> - %alloc_110 = memref.alloc() : memref<128x128x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%28#0, %alloc_106, %alloc_110) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128x128x3x3xf16> - %55 = call @Unknown114(%28#1, %alloc_109) : (memref<4x128x28x28xi1>, memref<4x128x28x28xf16>) -> memref<4x128x28x28xf16> - %alloc_111 = memref.alloc() : memref<4x128x28x28xf16> - %alloc_112 = memref.alloc() : memref<128xf32> + %alloc_109 = memref.alloc() : memref<128xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_12, %arg28, %61, %alloc_107, %alloc_108, %alloc_109) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> + %alloc_110 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_107, %7, %alloc_110) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x64x3x3xf16>, memref<4x64x56x56xf16> + %alloc_111 = memref.alloc() : memref<128x64x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%29#0, %alloc_107, %alloc_111) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x128x28x28xf16>, memref<128x64x3x3xf16> + %alloc_112 = memref.alloc() : memref<4x128x28x28xf16> %alloc_113 = memref.alloc() : memref<128xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_13, %arg28, %55, %alloc_111, %alloc_112, %alloc_113) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %alloc_114 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_111, %7, %alloc_114) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, 
memref<128x64x3x3xf16>, memref<4x64x56x56xf16> - %alloc_115 = memref.alloc() : memref<128x64x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%27#0, %alloc_111, %alloc_115) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x128x28x28xf16>, memref<128x64x3x3xf16> - %alloc_116 = memref.alloc() : memref<4x128x28x28xf16> - %alloc_117 = memref.alloc() : memref<128xf32> - %alloc_118 = memref.alloc() : memref<128xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_11, %arg38, %54, %alloc_116, %alloc_117, %alloc_118) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> - %alloc_119 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_116, %6, %alloc_119) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x64x1x1xf16>, memref<4x64x56x56xf16> - %alloc_120 = memref.alloc() : memref<128x64x1x1xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%27#0, %alloc_116, %alloc_120) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x128x28x28xf16>, memref<128x64x1x1xf16> - %56 = call @Unknown121(%alloc_119, %alloc_114, %27#1) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> - %alloc_121 = memref.alloc() : memref<4x64x56x56xf16> - %alloc_122 = memref.alloc() : memref<64xf32> + %alloc_114 = memref.alloc() : memref<128xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_10, %arg38, %60, %alloc_112, %alloc_113, %alloc_114) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16>, memref<128xf32>, memref<4x128x28x28xf16>, memref<4x128x28x28xf16>, memref<128xf32>, memref<128xf32> + %alloc_115 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_112, %6, %alloc_115) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16>, memref<128x64x1x1xf16>, memref<4x64x56x56xf16> + %alloc_116 = memref.alloc() : memref<128x64x1x1xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%29#0, %alloc_112, %alloc_116) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x128x28x28xf16>, 
memref<128x64x1x1xf16> + %62 = call @Unknown127(%alloc_115, %alloc_110, %29#1) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> + %alloc_117 = memref.alloc() : memref<4x64x56x56xf16> + %alloc_118 = memref.alloc() : memref<64xf32> + %alloc_119 = memref.alloc() : memref<64xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_8, %arg23, %62, %alloc_117, %alloc_118, %alloc_119) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> + %alloc_120 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_117, %5, %alloc_120) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + %alloc_121 = memref.alloc() : memref<64x64x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%28#0, %alloc_117, %alloc_121) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> + %63 = call @Unknown131(%28#1, %alloc_120) : (memref<4x64x56x56xi1>, memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> + %alloc_122 = memref.alloc() : memref<4x64x56x56xf16> %alloc_123 = memref.alloc() : memref<64xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_9, %arg23, %56, %alloc_121, %alloc_122, %alloc_123) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> - %alloc_124 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_121, %5, %alloc_124) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> - %alloc_125 = memref.alloc() : memref<64x64x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%26#0, %alloc_121, %alloc_125) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> - %57 = call @Unknown125(%26#1, %alloc_124) : (memref<4x64x56x56xi1>, memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> - %alloc_126 = memref.alloc() : memref<4x64x56x56xf16> - %alloc_127 = memref.alloc() : memref<64xf32> + %alloc_124 = memref.alloc() : memref<64xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_6, %arg18, %63, %alloc_122, %alloc_123, %alloc_124) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, 
memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> + %alloc_125 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_122, %4, %alloc_125) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + %alloc_126 = memref.alloc() : memref<64x64x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%27#0, %alloc_122, %alloc_126) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> + %64 = call @Unknown127(%62, %alloc_125, %27#1) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> + %alloc_127 = memref.alloc() : memref<4x64x56x56xf16> %alloc_128 = memref.alloc() : memref<64xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_7, %arg18, %57, %alloc_126, %alloc_127, %alloc_128) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> - %alloc_129 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_126, %4, %alloc_129) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> - %alloc_130 = memref.alloc() : memref<64x64x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%25#0, %alloc_126, %alloc_130) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> - %58 = call @Unknown129(%56, %alloc_129, %25#1) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<4x64x56x56xi1>) -> memref<4x64x56x56xf16> - %alloc_131 = memref.alloc() : memref<4x64x56x56xf16> - %alloc_132 = memref.alloc() : memref<64xf32> + %alloc_129 = memref.alloc() : memref<64xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_4, %arg13, %64, %alloc_127, %alloc_128, %alloc_129) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> + %alloc_130 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_127, %3, %alloc_130) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", 
memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + %alloc_131 = memref.alloc() : memref<64x64x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%26#0, %alloc_127, %alloc_131) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> + %65 = call @Unknown131(%26#1, %alloc_130) : (memref<4x64x56x56xi1>, memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> + %alloc_132 = memref.alloc() : memref<4x64x56x56xf16> %alloc_133 = memref.alloc() : memref<64xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_5, %arg13, %58, %alloc_131, %alloc_132, %alloc_133) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> - %alloc_134 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_131, %3, %alloc_134) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> - %alloc_135 = memref.alloc() : memref<64x64x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%24#0, %alloc_131, %alloc_135) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> - %59 = call @Unknown133(%24#1, %alloc_134) : (memref<4x64x56x56xi1>, memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> - %alloc_136 = memref.alloc() : memref<4x64x56x56xf16> - %alloc_137 = memref.alloc() : memref<64xf32> - %alloc_138 = memref.alloc() : memref<64xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_3, %arg8, %59, %alloc_136, %alloc_137, %alloc_138) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> - %alloc_139 = memref.alloc() : memref<4x64x56x56xf16> - byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_136, %2, %alloc_139) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> - %alloc_140 = memref.alloc() : memref<64x64x3x3xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%alloc_2, %alloc_136, %alloc_140) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 
: i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> - %60 = call @Unknown137(%58, %alloc_139) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> - %alloc_141 = memref.alloc() : memref<4x64x112x112xf16> - byre.compute @PoolMaxGradOp_f16f16_f16(%23#0, %60, %alloc_141) {memory_effects = [1 : i32, 1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16>, memref<4x64x56x56xf16>, memref<4x64x112x112xf16> - %61 = call @Unknown138(%23#1, %alloc_141) : (memref<4x64x112x112xi1>, memref<4x64x112x112xf16>) -> memref<4x64x112x112xf16> - %alloc_142 = memref.alloc() : memref<4x64x112x112xf16> - %alloc_143 = memref.alloc() : memref<64xf32> - %alloc_144 = memref.alloc() : memref<64xf32> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc, %arg3, %61, %alloc_142, %alloc_143, %alloc_144) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x112x112xf16>, memref<64xf32>, memref<4x64x112x112xf16>, memref<4x64x112x112xf16>, memref<64xf32>, memref<64xf32> - %alloc_145 = memref.alloc() : memref<64x3x7x7xf16> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%0, %alloc_142, %alloc_145) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x3x224x224xf16>, memref<4x64x112x112xf16>, memref<64x3x7x7xf16> - %alloc_146 = memref.alloc() : memref - byre.compute @ReduceSumOp_f32_f32(%43#1, %alloc_146) {dimensions = dense<[0, 1]> : tensor<2xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf32>, memref - %62 = call @Unknown141(%alloc_146) : (memref) -> memref - %63 = call @Unknown142(%alloc_145) : (memref<64x3x7x7xf16>) -> memref<64x3x7x7xf32> - %64 = call @Unknown143(%alloc_140) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %65 = call @Unknown144(%alloc_135) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %66 = call @Unknown145(%alloc_130) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %67 = call @Unknown146(%alloc_125) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> - %68 = call @Unknown147(%alloc_115) : (memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> - %69 = call @Unknown148(%alloc_110) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> - %70 = call @Unknown149(%alloc_120) : (memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> - %71 = call @Unknown150(%alloc_105) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> - %72 = call @Unknown151(%alloc_100) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> - %73 = call @Unknown152(%alloc_90) : (memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> - %74 = call @Unknown153(%alloc_85) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> - %75 = call @Unknown154(%alloc_95) : (memref<256x128x1x1xf16>) -> memref<256x128x1x1xf32> - %76 = call @Unknown155(%alloc_80) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> - %77 = call @Unknown156(%alloc_75) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> - %78 = call @Unknown157(%alloc_65) : (memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> - %79 = call 
@Unknown158(%alloc_60) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> - %80 = call @Unknown159(%alloc_70) : (memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> - %81 = call @Unknown160(%alloc_55) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> - %82 = call @Unknown161(%alloc_50) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> - %alloc_147 = memref.alloc() : memref<1000x512xf16> - byre.compute @MatmulOp_f16f16_f16(%40, %43#0, %alloc_147) {lhs_contracting_dimension = 0 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_transpose, rhs_contracting_dimension = 0 : i64} : memref<4x512xf16>, memref<4x1000xf16>, memref<1000x512xf16> - %83 = call @Unknown163(%alloc_147) : (memref<1000x512xf16>) -> memref<1000x512xf32> - %alloc_148 = memref.alloc() : memref<1000xf32> - byre.compute @ReduceSumOp_f32_f32(%43#2, %alloc_148) {dimensions = dense<0> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf32>, memref<1000xf32> - %84 = call @Unknown164(%alloc_148) : (memref<1000xf32>) -> memref<1000xf32> - return %62, %63, %alloc_143, %alloc_144, %64, %alloc_137, %alloc_138, %65, %alloc_132, %alloc_133, %66, %alloc_127, %alloc_128, %67, %alloc_122, %alloc_123, %68, %alloc_112, %alloc_113, %69, %alloc_107, %alloc_108, %70, %alloc_117, %alloc_118, %71, %alloc_102, %alloc_103, %72, %alloc_97, %alloc_98, %73, %alloc_87, %alloc_88, %74, %alloc_82, %alloc_83, %75, %alloc_92, %alloc_93, %76, %alloc_77, %alloc_78, %77, %alloc_72, %alloc_73, %78, %alloc_62, %alloc_63, %79, %alloc_57, %alloc_58, %80, %alloc_67, %alloc_68, %81, %alloc_52, %alloc_53, %82, %alloc_47, %alloc_48, %83, %84 : memref, memref<64x3x7x7xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<128x64x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x64x1x1xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<256x128x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x128x1x1xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<512x256x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x256x1x1xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<1000x512xf32>, memref<1000xf32> + %alloc_134 = memref.alloc() : memref<64xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_2, %arg8, %65, %alloc_132, %alloc_133, %alloc_134) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16>, memref<64xf32>, memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64xf32>, memref<64xf32> + %alloc_135 = memref.alloc() : memref<4x64x56x56xf16> + byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_132, %2, %alloc_135) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", 
padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<64x64x3x3xf16>, memref<4x64x56x56xf16> + %alloc_136 = memref.alloc() : memref<64x64x3x3xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%alloc_1, %alloc_132, %alloc_136) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16>, memref<4x64x56x56xf16>, memref<64x64x3x3xf16> + %66 = call @Unknown143(%64, %alloc_135) : (memref<4x64x56x56xf16>, memref<4x64x56x56xf16>) -> memref<4x64x56x56xf16> + %alloc_137 = memref.alloc() : memref<4x64x112x112xf16> + byre.compute @PoolMaxGradOp_f16f16_f16(%25#0, %66, %alloc_137) {memory_effects = [1 : i32, 1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16>, memref<4x64x56x56xf16>, memref<4x64x112x112xf16> + %67 = call @Unknown144(%25#1, %alloc_137) : (memref<4x64x112x112xi1>, memref<4x64x112x112xf16>) -> memref<4x64x112x112xf16> + %alloc_138 = memref.alloc() : memref<4x64x112x112xf16> + %alloc_139 = memref.alloc() : memref<64xf32> + %alloc_140 = memref.alloc() : memref<64xf32> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc, %arg3, %67, %alloc_138, %alloc_139, %alloc_140) {epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x112x112xf16>, memref<64xf32>, memref<4x64x112x112xf16>, memref<4x64x112x112xf16>, memref<64xf32>, memref<64xf32> + %alloc_141 = memref.alloc() : memref<64x3x7x7xf16> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%0, %alloc_138, %alloc_141) {batch_group_count = 1 : i64, feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x3x224x224xf16>, memref<4x64x112x112xf16>, memref<64x3x7x7xf16> + %68 = call @Unknown147(%49#0, %arg1) : (memref<4x1000xf16>, memref<4x1000xf32>) -> memref + %69 = call @Unknown148(%68) : (memref) -> memref + %70 = call @Unknown149(%alloc_141) : (memref<64x3x7x7xf16>) -> memref<64x3x7x7xf32> + %71 = call @Unknown150(%alloc_136) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %72 = call @Unknown150(%alloc_131) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %73 = call @Unknown150(%alloc_126) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %74 = call @Unknown150(%alloc_121) : (memref<64x64x3x3xf16>) -> memref<64x64x3x3xf32> + %75 = call @Unknown154(%alloc_111) : (memref<128x64x3x3xf16>) -> memref<128x64x3x3xf32> + %76 = call @Unknown155(%alloc_106) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> + %77 = call @Unknown156(%alloc_116) : (memref<128x64x1x1xf16>) -> memref<128x64x1x1xf32> + %78 = call @Unknown155(%alloc_101) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> + %79 = call @Unknown155(%alloc_96) : (memref<128x128x3x3xf16>) -> memref<128x128x3x3xf32> + %80 = call @Unknown159(%alloc_86) : (memref<256x128x3x3xf16>) -> memref<256x128x3x3xf32> + %81 = call @Unknown160(%alloc_81) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> + %82 = call @Unknown161(%alloc_91) : (memref<256x128x1x1xf16>) -> 
memref<256x128x1x1xf32> + %83 = call @Unknown160(%alloc_76) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> + %84 = call @Unknown160(%alloc_71) : (memref<256x256x3x3xf16>) -> memref<256x256x3x3xf32> + %85 = call @Unknown164(%alloc_61) : (memref<512x256x3x3xf16>) -> memref<512x256x3x3xf32> + %86 = call @Unknown165(%alloc_56) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> + %87 = call @Unknown166(%alloc_66) : (memref<512x256x1x1xf16>) -> memref<512x256x1x1xf32> + %88 = call @Unknown165(%alloc_51) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> + %89 = call @Unknown165(%alloc_46) : (memref<512x512x3x3xf16>) -> memref<512x512x3x3xf32> + %alloc_142 = memref.alloc() : memref<1000x512xf16> + byre.compute @MatmulOp_f16f16_f16(%43, %49#1, %alloc_142) {lhs_contracting_dimension = 0 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_transpose, rhs_contracting_dimension = 0 : i64} : memref<4x512xf16>, memref<4x1000xf16>, memref<1000x512xf16> + %90 = call @Unknown170(%alloc_142) : (memref<1000x512xf16>) -> memref<1000x512xf32> + %91 = call @Unknown171(%49#1) : (memref<4x1000xf16>) -> memref<1000xf32> + %92 = call @Unknown172(%91) : (memref<1000xf32>) -> memref<1000xf32> + return %69, %70, %alloc_139, %alloc_140, %71, %alloc_133, %alloc_134, %72, %alloc_128, %alloc_129, %73, %alloc_123, %alloc_124, %74, %alloc_118, %alloc_119, %75, %alloc_108, %alloc_109, %76, %alloc_103, %alloc_104, %77, %alloc_113, %alloc_114, %78, %alloc_98, %alloc_99, %79, %alloc_93, %alloc_94, %80, %alloc_83, %alloc_84, %81, %alloc_78, %alloc_79, %82, %alloc_88, %alloc_89, %83, %alloc_73, %alloc_74, %84, %alloc_68, %alloc_69, %85, %alloc_58, %alloc_59, %86, %alloc_53, %alloc_54, %87, %alloc_63, %alloc_64, %88, %alloc_48, %alloc_49, %89, %alloc_43, %alloc_44, %90, %92 : memref, memref<64x3x7x7xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<64x64x3x3xf32>, memref<64xf32>, memref<64xf32>, memref<128x64x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x64x1x1xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<128x128x3x3xf32>, memref<128xf32>, memref<128xf32>, memref<256x128x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x128x1x1xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<256x256x3x3xf32>, memref<256xf32>, memref<256xf32>, memref<512x256x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x256x1x1xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<512x512x3x3xf32>, memref<512xf32>, memref<512xf32>, memref<1000x512xf32>, memref<1000xf32> } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/Whole/8_byre_opt.mlir b/compiler/test/E2E/ResNet18/Whole/8_byre_opt.mlir index 551a80c98..2da0299c3 100644 --- a/compiler/test/E2E/ResNet18/Whole/8_byre_opt.mlir +++ b/compiler/test/E2E/ResNet18/Whole/8_byre_opt.mlir @@ -4,4058 +4,2191 @@ module @IrToMhlo.2452 attributes {gpu.container_module} { gpu.module @unified { - gpu.func @Unknown164(%arg0: memref<1000xf32>, %arg1: memref<1000xf32>) kernel { + gpu.func @Unknown172(%arg0: memref<1000xf32>, %arg1: memref<1000xf32>) kernel { %c1000 
= arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1000 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<1000xf32> - %7 = arith.truncf %6 : f32 to f16 - %8 = arith.extf %7 : f16 to f32 - memref.store %8, %arg1[%4] : memref<1000xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1000 step %6 { + %7 = memref.load %arg0[%arg2] : memref<1000xf32> + %8 = arith.truncf %7 : f32 to f16 + %9 = arith.extf %8 : f16 to f32 + memref.store %9, %arg1[%arg2] : memref<1000xf32> } gpu.return } - gpu.func @Unknown163(%arg0: memref<1000x512xf16>, %arg1: memref<1000x512xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown170(%arg0: memref<1000x512xf16>, %arg1: memref<1000x512xf32>) kernel { %c512000 = arith.constant 512000 : index %c512 = arith.constant 512 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512000 : index - scf.if %5 { - %6 = arith.remsi %4, %c512 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c512 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c512 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9] : memref<1000x512xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9] : memref<1000x512xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c512000 step %6 { + %7 = arith.remsi %arg2, %c512 : index + %8 = arith.divsi %arg2, %c512 : index + %9 = memref.load %arg0[%8, %7] : memref<1000x512xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7] : memref<1000x512xf32> } gpu.return } - gpu.func @Unknown161(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { + gpu.func @Unknown166(%arg0: memref<512x256x1x1xf16>, %arg1: memref<512x256x1x1xf32>) kernel { + %c131072 = arith.constant 131072 : index %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index + %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = 
arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c131072 step %6 { + %7 = arith.remsi %arg2, %c256 : index + %8 = arith.divsi %arg2, %c256 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<512x256x1x1xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<512x256x1x1xf32> } gpu.return } - gpu.func @Unknown160(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown165(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c512 = arith.constant 512 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c2359296 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x512x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x512x3x3xf32> } gpu.return } - gpu.func @Unknown159(%arg0: memref<512x256x1x1xf16>, %arg1: memref<512x256x1x1xf32>) kernel { - %c0 = arith.constant 0 : index - %c131072 = arith.constant 131072 : index + gpu.func @Unknown164(%arg0: memref<512x256x3x3xf16>, %arg1: 
memref<512x256x3x3xf32>) kernel { + %c1179648 = arith.constant 1179648 : index %c256 = arith.constant 256 : index - %c-1 = arith.constant -1 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c131072 : index - scf.if %5 { - %6 = arith.remsi %4, %c256 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c256 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c256 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<512x256x1x1xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<512x256x1x1xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1179648 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x256x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x256x3x3xf32> } gpu.return } - gpu.func @Unknown158(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { + gpu.func @Unknown161(%arg0: memref<256x128x1x1xf16>, %arg1: memref<256x128x1x1xf32>) kernel { + %c32768 = arith.constant 32768 : index %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index + %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c32768 step %6 { + %7 = arith.remsi 
%arg2, %c128 : index + %8 = arith.divsi %arg2, %c128 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<256x128x1x1xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<256x128x1x1xf32> } gpu.return } - gpu.func @Unknown157(%arg0: memref<512x256x3x3xf16>, %arg1: memref<512x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c1179648 = arith.constant 1179648 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown160(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { + %c589824 = arith.constant 589824 : index %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1179648 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x256x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c589824 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x256x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x256x3x3xf32> } gpu.return } - gpu.func @Unknown156(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index + gpu.func @Unknown159(%arg0: memref<256x128x3x3xf16>, %arg1: memref<256x128x3x3xf32>) kernel { + %c294912 = arith.constant 294912 : index + %c128 = arith.constant 128 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi 
slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c294912 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x128x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x128x3x3xf32> } gpu.return } - gpu.func @Unknown155(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { + gpu.func @Unknown156(%arg0: memref<128x64x1x1xf16>, %arg1: memref<128x64x1x1xf32>) kernel { + %c8192 = arith.constant 8192 : index %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index + %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : 
memref<256x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c8192 step %6 { + %7 = arith.remsi %arg2, %c64 : index + %8 = arith.divsi %arg2, %c64 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<128x64x1x1xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<128x64x1x1xf32> } gpu.return } - gpu.func @Unknown154(%arg0: memref<256x128x1x1xf16>, %arg1: memref<256x128x1x1xf32>) kernel { - %c0 = arith.constant 0 : index - %c32768 = arith.constant 32768 : index + gpu.func @Unknown155(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { + %c147456 = arith.constant 147456 : index %c128 = arith.constant 128 : index - %c-1 = arith.constant -1 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c32768 : index - scf.if %5 { - %6 = arith.remsi %4, %c128 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c128 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c128 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<256x128x1x1xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<256x128x1x1xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c147456 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x128x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x128x3x3xf32> } gpu.return } - gpu.func @Unknown153(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index + gpu.func @Unknown154(%arg0: memref<128x64x3x3xf16>, %arg1: memref<128x64x3x3xf32>) kernel { + %c73728 = arith.constant 73728 : index + %c64 = arith.constant 64 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, 
%24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c73728 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x64x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x64x3x3xf32> } gpu.return } - gpu.func @Unknown152(%arg0: memref<256x128x3x3xf16>, %arg1: memref<256x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c294912 = arith.constant 294912 : index + gpu.func @Unknown150(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { + %c36864 = arith.constant 36864 : index + %c64 = arith.constant 64 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c294912 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x128x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c36864 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x64x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] 
: memref<64x64x3x3xf32> } gpu.return } - gpu.func @Unknown151(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index + gpu.func @Unknown149(%arg0: memref<64x3x7x7xf16>, %arg1: memref<64x3x7x7xf32>) kernel { + %c9408 = arith.constant 9408 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c9408 step %6 { + %7 = arith.remsi %arg2, %c7 : index + %8 = arith.divsi %arg2, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c3 : index + %12 = arith.divsi %10, %c3 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x3x7x7xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x3x7x7xf32> } gpu.return } - gpu.func @Unknown150(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + gpu.func @Unknown148(%arg0: memref, %arg1: memref) kernel { + %c1 = arith.constant 1 : index + %cst = arith.constant 4.000000e+00 : f32 %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi 
slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1 step %6 { + %7 = memref.load %arg0[] : memref + %8 = arith.negf %7 : f32 + %9 = arith.divf %8, %cst : f32 + memref.store %9, %arg1[] : memref } gpu.return } - gpu.func @Unknown149(%arg0: memref<128x64x1x1xf16>, %arg1: memref<128x64x1x1xf32>) kernel { - %c0 = arith.constant 0 : index - %c8192 = arith.constant 8192 : index + gpu.func @Unknown144(%arg0: memref<4x64x112x112xi1>, %arg1: memref<4x64x112x112xf16>, %arg2: memref<4x64x112x112xf16>) kernel { + %c3211264 = arith.constant 3211264 : index + %cst = arith.constant 0.000000e+00 : f16 %c64 = arith.constant 64 : index - %c-1 = arith.constant -1 : index + %c112 = arith.constant 112 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c8192 : index - scf.if %5 { - %6 = arith.remsi %4, %c64 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c64 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c64 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<128x64x1x1xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<128x64x1x1xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c3211264 step %6 { + %7 = arith.remsi %arg3, %c112 : index + %8 = arith.divsi %arg3, %c112 : index + %9 = arith.remsi %8, %c112 : index + %10 = arith.divsi %8, %c112 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x112x112xi1> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x64x112x112xf16> + %15 = arith.select %13, %14, %cst : f16 + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x64x112x112xf16> } gpu.return } - gpu.func @Unknown148(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + gpu.func @Unknown143(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>) kernel { + %c802816 = arith.constant 802816 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = 
gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg3, %c56 : index + %8 = arith.divsi %arg3, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %15 = arith.addf %13, %14 : f16 + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x64x56x56xf16> } gpu.return } - gpu.func @Unknown147(%arg0: memref<128x64x3x3xf16>, %arg1: memref<128x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c73728 = arith.constant 73728 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown131(%arg0: memref<4x64x56x56xi1>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>) kernel { + %c802816 = arith.constant 802816 : index + %cst = arith.constant 0.000000e+00 : f16 %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c73728 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = 
arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg3, %c56 : index + %8 = arith.divsi %arg3, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x56x56xi1> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %15 = arith.select %13, %14, %cst : f16 + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x64x56x56xf16> } gpu.return } - gpu.func @Unknown146(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown127(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>, %arg3: memref<4x64x56x56xf16>) kernel { + %c802816 = arith.constant 802816 : index + %cst = arith.constant 0.000000e+00 : f16 %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg4, %c56 : index + %8 = arith.divsi %arg4, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + 
%13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %15 = memref.load %arg2[%12, %11, %9, %7] : memref<4x64x56x56xi1> + %16 = arith.addf %13, %14 : f16 + %17 = arith.select %15, %16, %cst : f16 + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x64x56x56xf16> } gpu.return } - gpu.func @Unknown145(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown112(%arg0: memref<4x128x28x28xi1>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xf16>) kernel { + %c401408 = arith.constant 401408 : index + %cst = arith.constant 0.000000e+00 : f16 + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c401408 step %6 { + %7 = arith.remsi %arg3, %c28 : index + %8 = arith.divsi %arg3, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x128x28x28xi1> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x128x28x28xf16> + %15 = arith.select %13, %14, %cst : f16 + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x128x28x28xf16> } gpu.return } - gpu.func @Unknown144(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown108(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>, %arg3: memref<4x128x28x28xf16>) kernel { + %c401408 = arith.constant 401408 : index + %cst = arith.constant 0.000000e+00 : f16 
+ %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c401408 step %6 { + %7 = arith.remsi %arg4, %c28 : index + %8 = arith.divsi %arg4, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x128x28x28xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x128x28x28xf16> + %15 = memref.load %arg2[%12, %11, %9, %7] : memref<4x128x28x28xi1> + %16 = arith.addf %13, %14 : f16 + %17 = arith.select %15, %16, %cst : f16 + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x128x28x28xf16> } gpu.return } - gpu.func @Unknown143(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown93(%arg0: memref<4x256x14x14xi1>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xf16>) kernel { + %c200704 = arith.constant 200704 : index + %cst = arith.constant 0.000000e+00 : f16 + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, 
%18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg3, %c14 : index + %8 = arith.divsi %arg3, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x256x14x14xi1> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x256x14x14xf16> + %15 = arith.select %13, %14, %cst : f16 + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x256x14x14xf16> } gpu.return } - gpu.func @Unknown142(%arg0: memref<64x3x7x7xf16>, %arg1: memref<64x3x7x7xf32>) kernel { - %c0 = arith.constant 0 : index - %c9408 = arith.constant 9408 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c3 = arith.constant 3 : index + gpu.func @Unknown89(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>, %arg3: memref<4x256x14x14xf16>) kernel { + %c200704 = arith.constant 200704 : index + %cst = arith.constant 0.000000e+00 : f16 + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c9408 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c3 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c3 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c3 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x3x7x7xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x3x7x7xf32> + %5 = gpu.grid_dim x + %6 = 
arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg4, %c14 : index + %8 = arith.divsi %arg4, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x256x14x14xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x256x14x14xf16> + %15 = memref.load %arg2[%12, %11, %9, %7] : memref<4x256x14x14xi1> + %16 = arith.addf %13, %14 : f16 + %17 = arith.select %15, %16, %cst : f16 + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x256x14x14xf16> } gpu.return } - gpu.func @Unknown141(%arg0: memref<f32>, %arg1: memref<f32>) kernel { - %cst = arith.constant 4.000000e+00 : f32 - %c1 = arith.constant 1 : index + gpu.func @Unknown78(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>, %arg3: memref<4x512x7x7xf16>) kernel { + %c100352 = arith.constant 100352 : index + %cst = arith.constant 0.000000e+00 : f16 + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1 : index - scf.if %5 { - %6 = memref.load %arg0[] : memref<f32> - %7 = arith.negf %6 : f32 - %8 = arith.divf %7, %cst : f32 - memref.store %8, %arg1[] : memref<f32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg4, %c7 : index + %8 = arith.divsi %arg4, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x512x7x7xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x512x7x7xf16> + %15 = memref.load %arg2[%12, %11, %9, %7] : memref<4x512x7x7xi1> + %16 = arith.addf %13, %14 : f16 + %17 = arith.select %15, %16, %cst : f16 + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x512x7x7xf16> } gpu.return } - gpu.func @Unknown138(%arg0: memref<4x64x112x112xi1>, %arg1: memref<4x64x112x112xf16>, %arg2: memref<4x64x112x112xf16>) kernel { + gpu.func @Unknown74(%arg0: memref<4x512x7x7xi1>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xf16>) kernel { + %c100352 = arith.constant 100352 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c3211264 = arith.constant 3211264 : index - %c112 = arith.constant 112 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c3211264 : index - scf.if %5 { - %6 = arith.remsi %4, %c112 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c112 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c112 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c112 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c112 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 
= arith.divsi %22, %c112 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x112x112xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x112x112xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x112x112xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg3, %c7 : index + %8 = arith.divsi %arg3, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x512x7x7xi1> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x512x7x7xf16> + %15 = arith.select %13, %14, %cst : f16 + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x512x7x7xf16> } gpu.return } - gpu.func @Unknown137(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>) kernel { - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown70(%arg0: memref<4x512xf16>, %arg1: memref<4x512x7x7xi1>, %arg2: memref<4x512x7x7xf16>) kernel { + %c100352 = arith.constant 100352 : index + %cst = arith.constant 4.900000e+01 : f16 + %cst_0 = arith.constant 0.000000e+00 : f16 + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = arith.addf %36, %37 : f16 - memref.store %38, %arg2[%35, %29, %19, 
%9] : memref<4x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg3, %c7 : index + %8 = arith.divsi %arg3, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11] : memref<4x512xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x512x7x7xi1> + %15 = arith.divf %13, %cst : f16 + %16 = arith.select %14, %15, %cst_0 : f16 + memref.store %16, %arg2[%12, %11, %9, %7] : memref<4x512x7x7xf16> } gpu.return } - gpu.func @Unknown133(%arg0: memref<4x64x56x56xi1>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown69(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4xf16>, %arg3: memref<4x1000xf16>, %arg4: memref<4x1000xf16>, %arg5: memref<4x1000xf16>) kernel { + %c4000 = arith.constant 4000 : index + %c1000 = arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg6 = %4 to %c4000 step %6 { + %7 = arith.remsi %arg6, %c1000 : index + %8 = arith.divsi %arg6, %c1000 : index + %9 = memref.load %arg2[%8] : memref<4xf16> + %10 = memref.load %arg0[%8] : memref<4xf16> + %11 = memref.load %arg1[%8, %7] : memref<4x1000xf16> + %12 = memref.load %arg3[%8, %7] : memref<4x1000xf16> + %13 = arith.subf %11, %10 : f16 + %14 = math.exp %13 : f16 + %15 = arith.mulf %14, %9 : f16 + %16 = arith.subf %12, %15 : f16 + memref.store %13, %arg4[%8, %7] : memref<4x1000xf16> + memref.store %16, %arg5[%8, %7] : memref<4x1000xf16> } gpu.return } - gpu.func @Unknown129(%arg0: memref<4x64x56x56xf16>, %arg1: 
memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>, %arg3: memref<4x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown68(%arg0: memref<4xf16>, %arg1: memref<4xf16>) kernel { + %c4 = arith.constant 4 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x64x56x56xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c4 step %6 { + %7 = memref.load %arg0[%arg2] : memref<4xf16> + %8 = math.log %7 : f16 + memref.store %8, %arg1[%arg2] : memref<4xf16> } gpu.return } - gpu.func @Unknown125(%arg0: memref<4x64x56x56xi1>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown66(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4x1000xf16>) kernel { + %c4000 = arith.constant 4000 : index + %c1000 = arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - 
%18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c4000 step %6 { + %7 = arith.remsi %arg3, %c1000 : index + %8 = arith.divsi %arg3, %c1000 : index + %9 = memref.load %arg0[%8] : memref<4xf16> + %10 = memref.load %arg1[%8, %7] : memref<4x1000xf16> + %11 = arith.subf %10, %9 : f16 + memref.store %11, %arg2[%8, %7] : memref<4x1000xf16> } gpu.return } - gpu.func @Unknown121(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>, %arg3: memref<4x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown64(%arg0: memref<1000xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4x1000xf16>) kernel { + %c4000 = arith.constant 4000 : index + %c1000 = arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x64x56x56xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, 
%cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c4000 step %6 { + %7 = arith.remsi %arg3, %c1000 : index + %8 = arith.divsi %arg3, %c1000 : index + %9 = memref.load %arg0[%7] : memref<1000xf16> + %10 = memref.load %arg1[%8, %7] : memref<4x1000xf16> + %11 = arith.addf %10, %9 : f16 + memref.store %11, %arg2[%8, %7] : memref<4x1000xf16> } gpu.return } - gpu.func @Unknown114(%arg0: memref<4x128x28x28xi1>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + gpu.func @Unknown63(%arg0: memref<4x512xf16>, %arg1: memref<4x512xf16>) kernel { + %c2048 = arith.constant 2048 : index + %cst = arith.constant 2.040100e-02 : f16 + %c512 = arith.constant 512 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c2048 step %6 { + %7 = arith.remsi %arg2, %c512 : index + %8 = arith.divsi %arg2, %c512 : index + %9 = memref.load %arg0[%8, %7] : memref<4x512xf16> + %10 = arith.mulf %9, %cst : f16 + memref.store %10, %arg1[%8, %7] : memref<4x512xf16> } gpu.return } - gpu.func @Unknown110(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>, %arg3: memref<4x128x28x28xf16>) kernel { + gpu.func @Unknown57(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xf16>, %arg3: memref<4x512x7x7xi1>) kernel { + %c100352 = arith.constant 100352 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + 
%c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x128x28x28xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg4, %c7 : index + %8 = arith.divsi %arg4, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x512x7x7xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x512x7x7xf16> + %15 = arith.addf %13, %14 : f16 + %16 = arith.maximumf %15, %cst : f16 + %17 = arith.cmpf ogt, %16, %cst : f16 + memref.store %16, %arg2[%12, %11, %9, %7] : memref<4x512x7x7xf16> + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x512x7x7xi1> } gpu.return } - gpu.func @Unknown106(%arg0: memref<4x128x28x28xi1>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xf16>) kernel { + gpu.func @Unknown55(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>) kernel { + %c100352 = arith.constant 100352 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 
= arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg3, %c7 : index + %8 = arith.divsi %arg3, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x512x7x7xf16> + %14 = arith.maximumf %13, %cst : f16 + %15 = arith.cmpf ogt, %14, %cst : f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<4x512x7x7xf16> + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x512x7x7xi1> } gpu.return } - gpu.func @Unknown102(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>, %arg3: memref<4x128x28x28xf16>) kernel { + gpu.func @Unknown48(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xf16>, %arg3: memref<4x256x14x14xi1>) kernel { + %c200704 = arith.constant 200704 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, 
%26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x128x28x28xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg4, %c14 : index + %8 = arith.divsi %arg4, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x256x14x14xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x256x14x14xf16> + %15 = arith.addf %13, %14 : f16 + %16 = arith.maximumf %15, %cst : f16 + %17 = arith.cmpf ogt, %16, %cst : f16 + memref.store %16, %arg2[%12, %11, %9, %7] : memref<4x256x14x14xf16> + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x256x14x14xi1> } gpu.return } - gpu.func @Unknown95(%arg0: memref<4x256x14x14xi1>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index + gpu.func @Unknown46(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>) kernel { %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index + %cst = arith.constant 0.000000e+00 : f16 %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = 
arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg3, %c14 : index + %8 = arith.divsi %arg3, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x256x14x14xf16> + %14 = arith.maximumf %13, %cst : f16 + %15 = arith.cmpf ogt, %14, %cst : f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<4x256x14x14xf16> + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x256x14x14xi1> } gpu.return } - gpu.func @Unknown91(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>, %arg3: memref<4x256x14x14xf16>) kernel { + gpu.func @Unknown39(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xf16>, %arg3: memref<4x128x28x28xi1>) kernel { + %c401408 = arith.constant 401408 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x256x14x14xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c401408 step %6 { + %7 = arith.remsi %arg4, %c28 : index + %8 = arith.divsi %arg4, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x128x28x28xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x128x28x28xf16> + %15 = arith.addf %13, %14 : f16 + %16 = arith.maximumf %15, %cst : f16 + %17 = arith.cmpf ogt, %16, %cst 
: f16 + memref.store %16, %arg2[%12, %11, %9, %7] : memref<4x128x28x28xf16> + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x128x28x28xi1> } gpu.return } - gpu.func @Unknown87(%arg0: memref<4x256x14x14xi1>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xf16>) kernel { + gpu.func @Unknown37(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>) kernel { + %c401408 = arith.constant 401408 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c401408 step %6 { + %7 = arith.remsi %arg3, %c28 : index + %8 = arith.divsi %arg3, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x128x28x28xf16> + %14 = arith.maximumf %13, %cst : f16 + %15 = arith.cmpf ogt, %14, %cst : f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<4x128x28x28xf16> + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x128x28x28xi1> } gpu.return } - gpu.func @Unknown83(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>, %arg3: memref<4x256x14x14xf16>) kernel { + gpu.func @Unknown30(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>, %arg3: memref<4x64x56x56xi1>) kernel { + %c802816 = arith.constant 802816 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = 
arith.constant 256 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x256x14x14xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg4, %c56 : index + %8 = arith.divsi %arg4, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %15 = arith.addf %13, %14 : f16 + %16 = arith.maximumf %15, %cst : f16 + %17 = arith.cmpf ogt, %16, %cst : f16 + memref.store %16, %arg2[%12, %11, %9, %7] : memref<4x64x56x56xf16> + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x64x56x56xi1> } gpu.return } - gpu.func @Unknown76(%arg0: memref<4x512x7x7xi1>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xf16>) kernel { + gpu.func @Unknown28(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>) kernel { + %c802816 = arith.constant 802816 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - 
%14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xf16> - } - gpu.return - } - gpu.func @Unknown72(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>, %arg3: memref<4x512x7x7xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x512x7x7xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x512x7x7xf16> - } - gpu.return - } - gpu.func @Unknown68(%arg0: memref<4x512x7x7xi1>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant 
-1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xf16> - } - gpu.return - } - gpu.func @Unknown64(%arg0: memref<4x512xf16>, %arg1: memref<4x512x7x7xi1>, %arg2: memref<4x512x7x7xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %cst_0 = arith.constant 4.900000e+01 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg1[%35, %29, %19, %9] : memref<4x512x7x7xi1> - %37 = memref.load %arg0[%35, %29] : 
memref<4x512xf16> - %38 = arith.divf %37, %cst_0 : f16 - %39 = arith.select %36, %38, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xf16> - } - gpu.return - } - gpu.func @Unknown63(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4xf16>, %arg3: memref<4x1000xf16>, %arg4: memref<4x1000xf32>, %arg5: memref<4x1000xf16>, %arg6: memref<4x1000xf32>, %arg7: memref<4x1000xf32>) kernel { - %c0 = arith.constant 0 : index - %c4000 = arith.constant 4000 : index - %c1000 = arith.constant 1000 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c4000 : index - scf.if %5 { - %6 = arith.remsi %4, %c1000 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c1000 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c1000 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg3[%15, %9] : memref<4x1000xf16> - %17 = memref.load %arg1[%15, %9] : memref<4x1000xf16> - %18 = memref.load %arg0[%15] : memref<4xf16> - %19 = memref.load %arg2[%15] : memref<4xf16> - %20 = memref.load %arg4[%15, %9] : memref<4x1000xf32> - %21 = math.log %18 : f16 - %22 = arith.subf %17, %21 : f16 - %23 = math.exp %22 : f16 - %24 = arith.mulf %23, %19 : f16 - %25 = arith.subf %16, %24 : f16 - %26 = arith.extf %22 : f16 to f32 - %27 = arith.mulf %26, %20 : f32 - %28 = arith.extf %25 : f16 to f32 - memref.store %25, %arg5[%15, %9] : memref<4x1000xf16> - memref.store %27, %arg6[%15, %9] : memref<4x1000xf32> - memref.store %28, %arg7[%15, %9] : memref<4x1000xf32> - } - gpu.return - } - gpu.func @Unknown62(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4x1000xf16>, %arg3: memref<4x1000xf16>) kernel { - %c0 = arith.constant 0 : index - %c4000 = arith.constant 4000 : index - %c1000 = arith.constant 1000 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c4000 : index - scf.if %5 { - %6 = arith.remsi %4, %c1000 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c1000 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c1000 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg1[%15, %9] : memref<4x1000xf16> - %17 = memref.load %arg0[%15] : memref<4xf16> - %18 = arith.subf %16, %17 : f16 - %19 = math.exp %18 : f16 - memref.store %18, %arg2[%15, %9] : memref<4x1000xf16> - memref.store %19, %arg3[%15, %9] : memref<4x1000xf16> - } - gpu.return - } - gpu.func @Unknown61(%arg0: memref<1000xf32>, %arg1: memref<4x1000xf16>, %arg2: memref<4x1000xf16>) kernel { - %c0 = arith.constant 0 : index - %c4000 = arith.constant 4000 : index - %c1000 = arith.constant 1000 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c4000 : index - scf.if %5 { - %6 = arith.remsi %4, %c1000 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = 
arith.addi %6, %c1000 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c1000 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg1[%15, %9] : memref<4x1000xf16> - %17 = memref.load %arg0[%9] : memref<1000xf32> - %18 = arith.truncf %17 : f32 to f16 - %19 = arith.addf %16, %18 : f16 - memref.store %19, %arg2[%15, %9] : memref<4x1000xf16> - } - gpu.return - } - gpu.func @Unknown60(%arg0: memref<4x512xf16>, %arg1: memref<4x512xf16>) kernel { - %cst = arith.constant 2.040100e-02 : f16 - %c0 = arith.constant 0 : index - %c2048 = arith.constant 2048 : index - %c512 = arith.constant 512 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2048 : index - scf.if %5 { - %6 = arith.remsi %4, %c512 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c512 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c512 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9] : memref<4x512xf16> - %17 = arith.mulf %16, %cst : f16 - memref.store %17, %arg1[%15, %9] : memref<4x512xf16> - } - gpu.return - } - gpu.func @Unknown59(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xf16>, %arg3: memref<4x512x7x7xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, 
%39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x512x7x7xi1> - } - gpu.return - } - gpu.func @Unknown57(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xi1> - } - gpu.return - } - gpu.func @Unknown55(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xf16>, %arg3: memref<4x512x7x7xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = 
arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x512x7x7xi1> - } - gpu.return - } - gpu.func @Unknown53(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xi1> - } - gpu.return - } - gpu.func @Unknown50(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xf16>, %arg3: memref<4x256x14x14xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = 
arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x256x14x14xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x256x14x14xi1> - } - gpu.return - } - gpu.func @Unknown48(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x256x14x14xi1> - } - gpu.return - } - gpu.func @Unknown46(%arg0: memref<4x256x14x14xf16>, %arg1: 
memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xf16>, %arg3: memref<4x256x14x14xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x256x14x14xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x256x14x14xi1> - } - gpu.return - } - gpu.func @Unknown44(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : 
index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x256x14x14xi1> - } - gpu.return - } - gpu.func @Unknown41(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xf16>, %arg3: memref<4x128x28x28xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x128x28x28xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x128x28x28xi1> - } - gpu.return - } - gpu.func @Unknown39(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = 
arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x128x28x28xi1> - } - gpu.return - } - gpu.func @Unknown37(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xf16>, %arg3: memref<4x128x28x28xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x128x28x28xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x128x28x28xi1> - } - gpu.return - } - gpu.func @Unknown35(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>) 
kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x128x28x28xi1> - } - gpu.return - } - gpu.func @Unknown32(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>, %arg3: memref<4x64x56x56xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - 
%33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x64x56x56xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x64x56x56xi1> - } - gpu.return - } - gpu.func @Unknown30(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x56x56xi1> - } - gpu.return - } - gpu.func @Unknown28(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>, %arg3: memref<4x64x56x56xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = 
arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x64x56x56xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x64x56x56xi1> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg3, %c56 : index + %8 = arith.divsi %arg3, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %14 = arith.maximumf %13, %cst : f16 + %15 = arith.cmpf ogt, %14, %cst : f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<4x64x56x56xf16> + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x64x56x56xi1> } gpu.return } - gpu.func @Unknown26(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>) kernel { + gpu.func @Unknown26(%arg0: memref<4x64x112x112xf16>, %arg1: memref<4x64x112x112xf16>, %arg2: memref<4x64x112x112xi1>) kernel { + %c3211264 = arith.constant 3211264 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index + %c112 = arith.constant 112 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, 
%25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x56x56xi1> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c3211264 step %6 { + %7 = arith.remsi %arg3, %c112 : index + %8 = arith.divsi %arg3, %c112 : index + %9 = arith.remsi %8, %c112 : index + %10 = arith.divsi %8, %c112 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x112x112xf16> + %14 = arith.maximumf %13, %cst : f16 + %15 = arith.cmpf ogt, %14, %cst : f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<4x64x112x112xf16> + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x64x112x112xi1> } gpu.return } - gpu.func @Unknown24(%arg0: memref<4x64x112x112xf16>, %arg1: memref<4x64x112x112xf16>, %arg2: memref<4x64x112x112xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c3211264 = arith.constant 3211264 : index - %c112 = arith.constant 112 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown24(%arg0: memref<1000xf32>, %arg1: memref<1000xf16>) kernel { + %c1000 = arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c3211264 : index - scf.if %5 { - %6 = arith.remsi %4, %c112 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c112 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c112 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c112 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c112 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c112 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x112x112xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x64x112x112xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x112x112xi1> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1000 step %6 { + %7 = memref.load %arg0[%arg2] : memref<1000xf32> + %8 = arith.truncf %7 : f32 to f16 + memref.store %8, %arg1[%arg2] : memref<1000xf16> } gpu.return } gpu.func @Unknown23(%arg0: memref<1000x512xf32>, %arg1: memref<1000x512xf16>) kernel { - %c0 = arith.constant 0 
: index %c512000 = arith.constant 512000 : index %c512 = arith.constant 512 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512000 : index - scf.if %5 { - %6 = arith.remsi %4, %c512 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c512 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c512 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9] : memref<1000x512xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9] : memref<1000x512xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c512000 step %6 { + %7 = arith.remsi %arg2, %c512 : index + %8 = arith.divsi %arg2, %c512 : index + %9 = memref.load %arg0[%8, %7] : memref<1000x512xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7] : memref<1000x512xf16> } gpu.return } gpu.func @Unknown22(%arg0: memref<4x1000xf32>, %arg1: memref<4x1000xf16>) kernel { - %cst = arith.constant -2.500000e-01 : f32 - %c0 = arith.constant 0 : index %c4000 = arith.constant 4000 : index + %cst = arith.constant -2.500000e-01 : f32 %c1000 = arith.constant 1000 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c4000 : index - scf.if %5 { - %6 = arith.remsi %4, %c1000 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c1000 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c1000 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9] : memref<4x1000xf32> - %17 = arith.mulf %16, %cst : f32 - %18 = arith.truncf %17 : f32 to f16 - memref.store %18, %arg1[%15, %9] : memref<4x1000xf16> - } - gpu.return - } - gpu.func @Unknown21(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi 
%25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf16> - } - gpu.return - } - gpu.func @Unknown20(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c4000 step %6 { + %7 = arith.remsi %arg2, %c1000 : index + %8 = arith.divsi %arg2, %c1000 : index + %9 = memref.load %arg0[%8, %7] : memref<4x1000xf32> + %10 = arith.mulf %9, %cst : f32 + %11 = arith.truncf %10 : f32 to f16 + memref.store %11, %arg1[%8, %7] : memref<4x1000xf16> } gpu.return } gpu.func @Unknown19(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c512 = arith.constant 512 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, 
%c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c2359296 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x512x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x512x3x3xf16> } gpu.return } gpu.func @Unknown18(%arg0: memref<512x256x3x3xf32>, %arg1: memref<512x256x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c1179648 = arith.constant 1179648 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1179648 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x256x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1179648 step %6 { + %7 = arith.remsi 
%arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x256x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x256x3x3xf16> } gpu.return } gpu.func @Unknown17(%arg0: memref<512x256x1x1xf32>, %arg1: memref<512x256x1x1xf16>) kernel { - %c0 = arith.constant 0 : index %c131072 = arith.constant 131072 : index - %c256 = arith.constant 256 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c131072 : index - scf.if %5 { - %6 = arith.remsi %4, %c256 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c256 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c256 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<512x256x1x1xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<512x256x1x1xf16> - } - gpu.return - } - gpu.func @Unknown16(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf16> - } - gpu.return - } - gpu.func @Unknown15(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index %0 = 
gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c131072 step %6 { + %7 = arith.remsi %arg2, %c256 : index + %8 = arith.divsi %arg2, %c256 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<512x256x1x1xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<512x256x1x1xf16> } gpu.return } gpu.func @Unknown14(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, 
%33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c589824 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x256x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x256x3x3xf16> } gpu.return } gpu.func @Unknown13(%arg0: memref<256x128x3x3xf32>, %arg1: memref<256x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c294912 = arith.constant 294912 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c294912 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x128x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c294912 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x128x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x128x3x3xf16> } gpu.return } gpu.func @Unknown12(%arg0: memref<256x128x1x1xf32>, %arg1: memref<256x128x1x1xf16>) kernel { - %c0 = arith.constant 0 : index %c32768 = arith.constant 32768 : index - %c128 = arith.constant 128 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c32768 : index - scf.if %5 { - %6 = arith.remsi %4, %c128 : index - %7 = arith.cmpi slt, 
%6, %c0 : index - %8 = arith.addi %6, %c128 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c128 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<256x128x1x1xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<256x128x1x1xf16> - } - gpu.return - } - gpu.func @Unknown11(%arg0: memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf16> - } - gpu.return - } - gpu.func @Unknown10(%arg0: memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, 
%23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c32768 step %6 { + %7 = arith.remsi %arg2, %c128 : index + %8 = arith.divsi %arg2, %c128 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<256x128x1x1xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<256x128x1x1xf16> } gpu.return } gpu.func @Unknown9(%arg0: memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c147456 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x128x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x128x3x3xf16> } gpu.return } gpu.func @Unknown8(%arg0: memref<128x64x3x3xf32>, %arg1: memref<128x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c73728 = arith.constant 73728 : index - %c3 = arith.constant 3 : 
index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c73728 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x64x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c73728 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x64x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x64x3x3xf16> } gpu.return } gpu.func @Unknown7(%arg0: memref<128x64x1x1xf32>, %arg1: memref<128x64x1x1xf16>) kernel { - %c0 = arith.constant 0 : index %c8192 = arith.constant 8192 : index + %c0 = arith.constant 0 : index %c64 = arith.constant 64 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c8192 : index - scf.if %5 { - %6 = arith.remsi %4, %c64 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c64 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c64 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<128x64x1x1xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<128x64x1x1xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c8192 step %6 { + %7 = arith.remsi %arg2, %c64 : index + %8 = arith.divsi %arg2, %c64 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<128x64x1x1xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<128x64x1x1xf16> } gpu.return } - gpu.func @Unknown6(%arg0: 
memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown3(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c36864 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x64x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x64x3x3xf16> } gpu.return } - gpu.func @Unknown5(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index + gpu.func @Unknown1(%arg0: memref<64x3x7x7xf32>, %arg1: memref<64x3x7x7xf16>) kernel { + %c9408 = arith.constant 9408 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = 
arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c9408 step %6 { + %7 = arith.remsi %arg2, %c7 : index + %8 = arith.divsi %arg2, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c3 : index + %12 = arith.divsi %10, %c3 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x3x7x7xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x3x7x7xf16> } gpu.return } - gpu.func @Unknown4(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index + gpu.func @Unknown0(%arg0: memref<4x3x224x224xf32>, %arg1: memref<4x3x224x224xf16>) kernel { + %c602112 = arith.constant 602112 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + %c224 = arith.constant 224 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c602112 step %6 { + %7 = arith.remsi %arg2, %c224 : index + %8 = arith.divsi %arg2, %c224 : index + %9 = arith.remsi %8, %c224 : index + %10 = arith.divsi %8, %c224 : index + %11 = arith.remsi 
%10, %c3 : index + %12 = arith.divsi %10, %c3 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x3x224x224xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<4x3x224x224xf16> } gpu.return } - gpu.func @Unknown3(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index + gpu.func @Unknown25_kernel(%arg0: memref<4x1000xf16>, %arg1: memref<4xf16>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 0.000000e+00 : f16 + %c1000 = arith.constant 1000 : index + %c-1024 = arith.constant -1024 : index + %c512 = arith.constant 512 : index %c-1 = arith.constant -1 : index + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %0 = gpu.block_id x + %subview = memref.subview %arg0[%0, 0] [1, 1000] [1, 1] : memref<4x1000xf16> to memref<1000xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<1000xf16, strided<[1], offset: ?>> into memref<1x1000xf16, strided<[1000, 1], offset: ?>> + %alloca = memref.alloca() : memref<512xf16, #gpu.address_space> + %1 = gpu.thread_id x + %2 = arith.muli %1, %c2 : index + %3 = arith.cmpi slt, %1, %c0 : index + %4 = arith.subi %c-1, %1 : index + %5 = arith.select %3, %4, %1 : index + %6 = arith.divsi %5, %c512 : index + %7 = arith.subi %c-1, %6 : index + %8 = arith.select %3, %7, %6 : index + %9 = arith.muli %8, %c-1024 : index + %10 = arith.addi %2, %9 : index + %11 = arith.cmpi slt, %10, %c1000 : index + %12 = arith.select %11, %10, %c1000 : index + %13 = arith.addi %10, %c2 : index + %14 = arith.cmpi slt, %13, %c1000 : index + %15 = arith.select %14, %13, %c1000 : index + %16 = arith.subi %15, %12 : index + %subview_0 = memref.subview %expand_shape[0, %12] [1, %16] [1, 1] : memref<1x1000xf16, strided<[1000, 1], offset: ?>> to memref> + %expand_shape_1 = memref.expand_shape %subview_0 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %17 = arith.cmpi ugt, %16, %c0 : index + %18 = scf.if %17 -> (f16) { + %32 = memref.load %expand_shape_1[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %32 : f16 + } else { + scf.yield %cst : f16 + } + %19 = arith.addf %18, %cst : f16 + %20 = arith.cmpi ugt, %16, %c1 : index + %21 = scf.if %20 -> (f16) { + %32 = memref.load %expand_shape_1[%c0, %c1] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %32 : f16 + } else { + scf.yield %cst : f16 + } + %22 = arith.addf %19, %21 : f16 + memref.store %22, %alloca[%1] : memref<512xf16, #gpu.address_space> + gpu.barrier + %alloca_2 = memref.alloca() : memref<256xf16, #gpu.address_space> + %23 = arith.cmpi ult, %1, %c256 : index + scf.if %23 { + %32 = memref.load %alloca[%2] : memref<512xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca[%34] : memref<512xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %alloca_2[%1] : memref<256xf16, #gpu.address_space> + } + gpu.barrier + %alloca_3 = memref.alloca() : memref<128xf16, #gpu.address_space> + %24 = arith.cmpi ult, %1, %c128 : index + scf.if %24 { + %32 = 
memref.load %alloca_2[%2] : memref<256xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca_2[%34] : memref<256xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %alloca_3[%1] : memref<128xf16, #gpu.address_space> + } + gpu.barrier + %alloca_4 = memref.alloca() : memref<64xf16, #gpu.address_space> + %25 = arith.cmpi ult, %1, %c64 : index + scf.if %25 { + %32 = memref.load %alloca_3[%2] : memref<128xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca_3[%34] : memref<128xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %alloca_4[%1] : memref<64xf16, #gpu.address_space> + } + gpu.barrier + %alloca_5 = memref.alloca() : memref<32xf16, #gpu.address_space> + %26 = arith.cmpi ult, %1, %c32 : index + scf.if %26 { + %32 = memref.load %alloca_4[%2] : memref<64xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca_4[%34] : memref<64xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %alloca_5[%1] : memref<32xf16, #gpu.address_space> + } + gpu.barrier + %alloca_6 = memref.alloca() : memref<16xf16, #gpu.address_space> + %27 = arith.cmpi ult, %1, %c16 : index + scf.if %27 { + %32 = memref.load %alloca_5[%2] : memref<32xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca_5[%34] : memref<32xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %alloca_6[%1] : memref<16xf16, #gpu.address_space> + } + gpu.barrier + %alloca_7 = memref.alloca() : memref<8xf16, #gpu.address_space> + %28 = arith.cmpi ult, %1, %c8 : index + scf.if %28 { + %32 = memref.load %alloca_6[%2] : memref<16xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca_6[%34] : memref<16xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %alloca_7[%1] : memref<8xf16, #gpu.address_space> + } + gpu.barrier + %alloca_8 = memref.alloca() : memref<4xf16, #gpu.address_space> + %29 = arith.cmpi ult, %1, %c4 : index + scf.if %29 { + %32 = memref.load %alloca_7[%2] : memref<8xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca_7[%34] : memref<8xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %alloca_8[%1] : memref<4xf16, #gpu.address_space> + } + gpu.barrier + %alloca_9 = memref.alloca() : memref<2xf16, #gpu.address_space> + %30 = arith.cmpi ult, %1, %c2 : index + scf.if %30 { + %32 = memref.load %alloca_8[%2] : memref<4xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca_8[%34] : memref<4xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %alloca_9[%1] : memref<2xf16, #gpu.address_space> + } + gpu.barrier + %31 = arith.cmpi ult, %1, %c1 : index + scf.if %31 { + %32 = memref.load %alloca_9[%2] : memref<2xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca_9[%34] : memref<2xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %arg1[%0] : memref<4xf16> + } + gpu.barrier + gpu.return + } + gpu.func @Unknown62_kernel(%arg0: memref<2048x49xf16>, %arg1: memref<2048xf16>) kernel attributes 
{gpu.known_block_size = array, gpu.known_grid_size = array} { + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c2 = arith.constant 2 : index + %c32 = arith.constant 32 : index + %cst = arith.constant 0.000000e+00 : f16 + %c1 = arith.constant 1 : index + %c49 = arith.constant 49 : index + %c0 = arith.constant 0 : index %c64 = arith.constant 64 : index %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> - } - gpu.return - } - gpu.func @Unknown1(%arg0: memref<64x3x7x7xf32>, %arg1: memref<64x3x7x7xf16>) kernel { - %c0 = arith.constant 0 : index - %c9408 = arith.constant 9408 : index - %c7 = arith.constant 7 : index + %subview = memref.subview %arg0[%0, 0] [1, 49] [1, 1] : memref<2048x49xf16> to memref<49xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<49xf16, strided<[1], offset: ?>> into memref<1x49xf16, strided<[49, 1], offset: ?>> + %alloca = memref.alloca() : memref<64xf16, #gpu.address_space> + %1 = gpu.thread_id x + %2 = arith.remsi %1, %c64 : index + %3 = arith.cmpi slt, %2, %c0 : index + %4 = arith.addi %2, %c64 : index + %5 = arith.select %3, %4, %2 : index + %6 = arith.cmpi slt, %5, %c49 : index + %7 = arith.select %6, %5, %c49 : index + %8 = arith.addi %5, %c1 : index + %9 = arith.cmpi slt, %8, %c49 : index + %10 = arith.select %9, %8, %c49 : index + %11 = arith.subi %10, %7 : index + %subview_0 = memref.subview %expand_shape[0, %7] [1, %11] [1, 1] : memref<1x49xf16, strided<[49, 1], offset: ?>> to memref> + %expand_shape_1 = memref.expand_shape %subview_0 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %12 = arith.cmpi ugt, %11, %c0 : index + %13 = scf.if %12 -> (f16) { + %21 = memref.load %expand_shape_1[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %21 : f16 + } else { + scf.yield %cst : f16 + } + %14 = arith.addf %13, %cst : f16 + memref.store %14, %alloca[%1] : memref<64xf16, #gpu.address_space> + gpu.barrier + %alloca_2 = memref.alloca() : memref<32xf16, #gpu.address_space> 
+ %15 = arith.cmpi ult, %1, %c32 : index + scf.if %15 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca[%21] : memref<64xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca[%24] : memref<64xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_2[%1] : memref<32xf16, #gpu.address_space> + } + gpu.barrier + %alloca_3 = memref.alloca() : memref<16xf16, #gpu.address_space> + %16 = arith.cmpi ult, %1, %c16 : index + scf.if %16 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_2[%21] : memref<32xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_2[%24] : memref<32xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_3[%1] : memref<16xf16, #gpu.address_space> + } + gpu.barrier + %alloca_4 = memref.alloca() : memref<8xf16, #gpu.address_space> + %17 = arith.cmpi ult, %1, %c8 : index + scf.if %17 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_3[%21] : memref<16xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_3[%24] : memref<16xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_4[%1] : memref<8xf16, #gpu.address_space> + } + gpu.barrier + %alloca_5 = memref.alloca() : memref<4xf16, #gpu.address_space> + %18 = arith.cmpi ult, %1, %c4 : index + scf.if %18 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_4[%21] : memref<8xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_4[%24] : memref<8xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_5[%1] : memref<4xf16, #gpu.address_space> + } + gpu.barrier + %alloca_6 = memref.alloca() : memref<2xf16, #gpu.address_space> + %19 = arith.cmpi ult, %1, %c2 : index + scf.if %19 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_5[%21] : memref<4xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_5[%24] : memref<4xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_6[%1] : memref<2xf16, #gpu.address_space> + } + gpu.barrier + %20 = arith.cmpi ult, %1, %c1 : index + scf.if %20 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_6[%21] : memref<2xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_6[%24] : memref<2xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %arg1[%0] : memref<2048xf16> + } + gpu.barrier + gpu.return + } + gpu.func @Unknown65_kernel(%arg0: memref<4x1000xf16>, %arg1: memref<4xf16>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 0.000000e+00 : f16 + %c1000 = arith.constant 1000 : index + %c-1024 = arith.constant -1024 : index + %c512 = arith.constant 512 : index %c-1 = arith.constant -1 : index - %c3 = arith.constant 3 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = 
arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c9408 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c3 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c3 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c3 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x3x7x7xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x3x7x7xf16> - } - gpu.return - } - gpu.func @Unknown0(%arg0: memref<4x3x224x224xf32>, %arg1: memref<4x3x224x224xf16>) kernel { %c0 = arith.constant 0 : index - %c602112 = arith.constant 602112 : index - %c224 = arith.constant 224 : index + %c2 = arith.constant 2 : index + %0 = gpu.block_id x + %subview = memref.subview %arg0[%0, 0] [1, 1000] [1, 1] : memref<4x1000xf16> to memref<1000xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<1000xf16, strided<[1], offset: ?>> into memref<1x1000xf16, strided<[1000, 1], offset: ?>> + %alloca = memref.alloca() : memref<512xf16, #gpu.address_space> + %1 = gpu.thread_id x + %2 = arith.muli %1, %c2 : index + %3 = arith.cmpi slt, %1, %c0 : index + %4 = arith.subi %c-1, %1 : index + %5 = arith.select %3, %4, %1 : index + %6 = arith.divsi %5, %c512 : index + %7 = arith.subi %c-1, %6 : index + %8 = arith.select %3, %7, %6 : index + %9 = arith.muli %8, %c-1024 : index + %10 = arith.addi %2, %9 : index + %11 = arith.cmpi slt, %10, %c1000 : index + %12 = arith.select %11, %10, %c1000 : index + %13 = arith.addi %10, %c2 : index + %14 = arith.cmpi slt, %13, %c1000 : index + %15 = arith.select %14, %13, %c1000 : index + %16 = arith.subi %15, %12 : index + %subview_0 = memref.subview %expand_shape[0, %12] [1, %16] [1, 1] : memref<1x1000xf16, strided<[1000, 1], offset: ?>> to memref> + %expand_shape_1 = memref.expand_shape %subview_0 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %17 = arith.cmpi ugt, %16, %c0 : index + %18 = scf.if %17 -> (f16) { + %31 = memref.load %expand_shape_1[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %31 : f16 + } else { + scf.yield %cst : f16 + } + %19 = arith.cmpi ugt, %16, %c1 : index + %20 = scf.if %19 -> (f16) { + %31 = memref.load %expand_shape_1[%c0, %c1] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %31 : f16 + } else { + scf.yield %cst : f16 + } + %21 = arith.maximumf %18, %20 : f16 + memref.store %21, %alloca[%1] : memref<512xf16, #gpu.address_space> + gpu.barrier + %alloca_2 = memref.alloca() : 
memref<256xf16, #gpu.address_space> + %22 = arith.cmpi ult, %1, %c256 : index + scf.if %22 { + %31 = memref.load %alloca[%2] : memref<512xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca[%32] : memref<512xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %alloca_2[%1] : memref<256xf16, #gpu.address_space> + } + gpu.barrier + %alloca_3 = memref.alloca() : memref<128xf16, #gpu.address_space> + %23 = arith.cmpi ult, %1, %c128 : index + scf.if %23 { + %31 = memref.load %alloca_2[%2] : memref<256xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca_2[%32] : memref<256xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %alloca_3[%1] : memref<128xf16, #gpu.address_space> + } + gpu.barrier + %alloca_4 = memref.alloca() : memref<64xf16, #gpu.address_space> + %24 = arith.cmpi ult, %1, %c64 : index + scf.if %24 { + %31 = memref.load %alloca_3[%2] : memref<128xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca_3[%32] : memref<128xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %alloca_4[%1] : memref<64xf16, #gpu.address_space> + } + gpu.barrier + %alloca_5 = memref.alloca() : memref<32xf16, #gpu.address_space> + %25 = arith.cmpi ult, %1, %c32 : index + scf.if %25 { + %31 = memref.load %alloca_4[%2] : memref<64xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca_4[%32] : memref<64xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %alloca_5[%1] : memref<32xf16, #gpu.address_space> + } + gpu.barrier + %alloca_6 = memref.alloca() : memref<16xf16, #gpu.address_space> + %26 = arith.cmpi ult, %1, %c16 : index + scf.if %26 { + %31 = memref.load %alloca_5[%2] : memref<32xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca_5[%32] : memref<32xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %alloca_6[%1] : memref<16xf16, #gpu.address_space> + } + gpu.barrier + %alloca_7 = memref.alloca() : memref<8xf16, #gpu.address_space> + %27 = arith.cmpi ult, %1, %c8 : index + scf.if %27 { + %31 = memref.load %alloca_6[%2] : memref<16xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca_6[%32] : memref<16xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %alloca_7[%1] : memref<8xf16, #gpu.address_space> + } + gpu.barrier + %alloca_8 = memref.alloca() : memref<4xf16, #gpu.address_space> + %28 = arith.cmpi ult, %1, %c4 : index + scf.if %28 { + %31 = memref.load %alloca_7[%2] : memref<8xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca_7[%32] : memref<8xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %alloca_8[%1] : memref<4xf16, #gpu.address_space> + } + gpu.barrier + %alloca_9 = memref.alloca() : memref<2xf16, #gpu.address_space> + %29 = arith.cmpi ult, %1, %c2 : index + scf.if %29 { + %31 = memref.load %alloca_8[%2] : memref<4xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca_8[%32] : memref<4xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %alloca_9[%1] : memref<2xf16, #gpu.address_space> + } + gpu.barrier + %30 = arith.cmpi ult, %1, %c1 : index + scf.if %30 { + %31 = memref.load %alloca_9[%2] : memref<2xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = 
memref.load %alloca_9[%32] : memref<2xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %arg1[%0] : memref<4xf16> + } + gpu.barrier + gpu.return + } + gpu.func @Unknown67_kernel(%arg0: memref<4x1000xf16>, %arg1: memref<4xf16>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 0.000000e+00 : f16 + %c1000 = arith.constant 1000 : index + %c-1024 = arith.constant -1024 : index + %c512 = arith.constant 512 : index %c-1 = arith.constant -1 : index - %c3 = arith.constant 3 : index + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %0 = gpu.block_id x + %subview = memref.subview %arg0[%0, 0] [1, 1000] [1, 1] : memref<4x1000xf16> to memref<1000xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<1000xf16, strided<[1], offset: ?>> into memref<1x1000xf16, strided<[1000, 1], offset: ?>> + %alloca = memref.alloca() : memref<512xf16, #gpu.address_space> + %1 = gpu.thread_id x + %2 = arith.muli %1, %c2 : index + %3 = arith.cmpi slt, %1, %c0 : index + %4 = arith.subi %c-1, %1 : index + %5 = arith.select %3, %4, %1 : index + %6 = arith.divsi %5, %c512 : index + %7 = arith.subi %c-1, %6 : index + %8 = arith.select %3, %7, %6 : index + %9 = arith.muli %8, %c-1024 : index + %10 = arith.addi %2, %9 : index + %11 = arith.cmpi slt, %10, %c1000 : index + %12 = arith.select %11, %10, %c1000 : index + %13 = arith.addi %10, %c2 : index + %14 = arith.cmpi slt, %13, %c1000 : index + %15 = arith.select %14, %13, %c1000 : index + %16 = arith.subi %15, %12 : index + %subview_0 = memref.subview %expand_shape[0, %12] [1, %16] [1, 1] : memref<1x1000xf16, strided<[1000, 1], offset: ?>> to memref> + %expand_shape_1 = memref.expand_shape %subview_0 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %17 = arith.cmpi ugt, %16, %c0 : index + %18 = scf.if %17 -> (f16) { + %34 = memref.load %expand_shape_1[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %34 : f16 + } else { + scf.yield %cst : f16 + } + %19 = math.exp %18 : f16 + %20 = arith.addf %19, %cst : f16 + %21 = arith.cmpi ugt, %16, %c1 : index + %22 = scf.if %21 -> (f16) { + %34 = memref.load %expand_shape_1[%c0, %c1] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %34 : f16 + } else { + scf.yield %cst : f16 + } + %23 = math.exp %22 : f16 + %24 = arith.addf %20, %23 : f16 + memref.store %24, %alloca[%1] : memref<512xf16, #gpu.address_space> + gpu.barrier + %alloca_2 = memref.alloca() : memref<256xf16, #gpu.address_space> + %25 = arith.cmpi ult, %1, %c256 : index + scf.if %25 { + %34 = memref.load %alloca[%2] : memref<512xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca[%36] : memref<512xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_2[%1] : memref<256xf16, #gpu.address_space> + } + gpu.barrier + %alloca_3 = memref.alloca() : memref<128xf16, #gpu.address_space> + %26 = arith.cmpi ult, %1, %c128 : index + scf.if %26 { + %34 = memref.load %alloca_2[%2] : memref<256xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca_2[%36] : 
memref<256xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_3[%1] : memref<128xf16, #gpu.address_space> + } + gpu.barrier + %alloca_4 = memref.alloca() : memref<64xf16, #gpu.address_space> + %27 = arith.cmpi ult, %1, %c64 : index + scf.if %27 { + %34 = memref.load %alloca_3[%2] : memref<128xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca_3[%36] : memref<128xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_4[%1] : memref<64xf16, #gpu.address_space> + } + gpu.barrier + %alloca_5 = memref.alloca() : memref<32xf16, #gpu.address_space> + %28 = arith.cmpi ult, %1, %c32 : index + scf.if %28 { + %34 = memref.load %alloca_4[%2] : memref<64xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca_4[%36] : memref<64xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_5[%1] : memref<32xf16, #gpu.address_space> + } + gpu.barrier + %alloca_6 = memref.alloca() : memref<16xf16, #gpu.address_space> + %29 = arith.cmpi ult, %1, %c16 : index + scf.if %29 { + %34 = memref.load %alloca_5[%2] : memref<32xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca_5[%36] : memref<32xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_6[%1] : memref<16xf16, #gpu.address_space> + } + gpu.barrier + %alloca_7 = memref.alloca() : memref<8xf16, #gpu.address_space> + %30 = arith.cmpi ult, %1, %c8 : index + scf.if %30 { + %34 = memref.load %alloca_6[%2] : memref<16xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca_6[%36] : memref<16xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_7[%1] : memref<8xf16, #gpu.address_space> + } + gpu.barrier + %alloca_8 = memref.alloca() : memref<4xf16, #gpu.address_space> + %31 = arith.cmpi ult, %1, %c4 : index + scf.if %31 { + %34 = memref.load %alloca_7[%2] : memref<8xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca_7[%36] : memref<8xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_8[%1] : memref<4xf16, #gpu.address_space> + } + gpu.barrier + %alloca_9 = memref.alloca() : memref<2xf16, #gpu.address_space> + %32 = arith.cmpi ult, %1, %c2 : index + scf.if %32 { + %34 = memref.load %alloca_8[%2] : memref<4xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca_8[%36] : memref<4xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_9[%1] : memref<2xf16, #gpu.address_space> + } + gpu.barrier + %33 = arith.cmpi ult, %1, %c1 : index + scf.if %33 { + %34 = memref.load %alloca_9[%2] : memref<2xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca_9[%36] : memref<2xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %arg1[%0] : memref<4xf16> + } + gpu.barrier + gpu.return + } + gpu.func @Unknown147_kernel(%arg0: memref<32x125xf16>, %arg1: memref<32x125xf32>, %arg2: memref<32xf32>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c16 = 
arith.constant 16 : index + %c32 = arith.constant 32 : index + %c2 = arith.constant 2 : index + %c64 = arith.constant 64 : index + %cst = arith.constant 0.000000e+00 : f32 + %cst_0 = arith.constant 0.000000e+00 : f16 + %c1 = arith.constant 1 : index + %c125 = arith.constant 125 : index + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c602112 : index - scf.if %5 { - %6 = arith.remsi %4, %c224 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c224 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c224 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c224 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c224 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c224 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c3 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c3 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c3 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x3x224x224xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x3x224x224xf16> - } + %subview = memref.subview %arg0[%0, 0] [1, 125] [1, 1] : memref<32x125xf16> to memref<125xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<125xf16, strided<[1], offset: ?>> into memref<1x125xf16, strided<[125, 1], offset: ?>> + %subview_1 = memref.subview %arg1[%0, 0] [1, 125] [1, 1] : memref<32x125xf32> to memref<125xf32, strided<[1], offset: ?>> + %expand_shape_2 = memref.expand_shape %subview_1 [[0, 1]] : memref<125xf32, strided<[1], offset: ?>> into memref<1x125xf32, strided<[125, 1], offset: ?>> + %alloca = memref.alloca() : memref<128xf32, #gpu.address_space> + %1 = gpu.thread_id x + %2 = arith.remsi %1, %c128 : index + %3 = arith.cmpi slt, %2, %c0 : index + %4 = arith.addi %2, %c128 : index + %5 = arith.select %3, %4, %2 : index + %6 = arith.cmpi slt, %5, %c125 : index + %7 = arith.select %6, %5, %c125 : index + %8 = arith.addi %5, %c1 : index + %9 = arith.cmpi slt, %8, %c125 : index + %10 = arith.select %9, %8, %c125 : index + %11 = arith.subi %10, %7 : index + %subview_3 = memref.subview %expand_shape[0, %7] [1, %11] [1, 1] : memref<1x125xf16, strided<[125, 1], offset: ?>> to memref> + %expand_shape_4 = memref.expand_shape %subview_3 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %subview_5 = memref.subview %expand_shape_2[0, %7] [1, %11] [1, 1] : memref<1x125xf32, strided<[125, 1], offset: ?>> to memref> + %expand_shape_6 = memref.expand_shape %subview_5 [[0, 1]] : memref> into memref<1x?xf32, strided<[?, 1], offset: ?>> + %12 = arith.cmpi ugt, %11, %c0 : index + %13:2 = scf.if %12 -> (f16, f32) { + %24 = memref.load 
%expand_shape_4[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + %25 = memref.load %expand_shape_6[%c0, %c0] : memref<1x?xf32, strided<[?, 1], offset: ?>> + scf.yield %24, %25 : f16, f32 + } else { + scf.yield %cst_0, %cst : f16, f32 + } + %14 = arith.extf %13#0 : f16 to f32 + %15 = arith.mulf %14, %13#1 : f32 + %16 = arith.addf %15, %cst : f32 + memref.store %16, %alloca[%1] : memref<128xf32, #gpu.address_space> + gpu.barrier + %alloca_7 = memref.alloca() : memref<64xf32, #gpu.address_space> + %17 = arith.cmpi ult, %1, %c64 : index + scf.if %17 { + %24 = arith.muli %1, %c2 : index + %25 = memref.load %alloca[%24] : memref<128xf32, #gpu.address_space> + %26 = arith.addf %25, %cst : f32 + %27 = arith.addi %24, %c1 : index + %28 = memref.load %alloca[%27] : memref<128xf32, #gpu.address_space> + %29 = arith.addf %28, %26 : f32 + memref.store %29, %alloca_7[%1] : memref<64xf32, #gpu.address_space> + } + gpu.barrier + %alloca_8 = memref.alloca() : memref<32xf32, #gpu.address_space> + %18 = arith.cmpi ult, %1, %c32 : index + scf.if %18 { + %24 = arith.muli %1, %c2 : index + %25 = memref.load %alloca_7[%24] : memref<64xf32, #gpu.address_space> + %26 = arith.addf %25, %cst : f32 + %27 = arith.addi %24, %c1 : index + %28 = memref.load %alloca_7[%27] : memref<64xf32, #gpu.address_space> + %29 = arith.addf %28, %26 : f32 + memref.store %29, %alloca_8[%1] : memref<32xf32, #gpu.address_space> + } + gpu.barrier + %alloca_9 = memref.alloca() : memref<16xf32, #gpu.address_space> + %19 = arith.cmpi ult, %1, %c16 : index + scf.if %19 { + %24 = arith.muli %1, %c2 : index + %25 = memref.load %alloca_8[%24] : memref<32xf32, #gpu.address_space> + %26 = arith.addf %25, %cst : f32 + %27 = arith.addi %24, %c1 : index + %28 = memref.load %alloca_8[%27] : memref<32xf32, #gpu.address_space> + %29 = arith.addf %28, %26 : f32 + memref.store %29, %alloca_9[%1] : memref<16xf32, #gpu.address_space> + } + gpu.barrier + %alloca_10 = memref.alloca() : memref<8xf32, #gpu.address_space> + %20 = arith.cmpi ult, %1, %c8 : index + scf.if %20 { + %24 = arith.muli %1, %c2 : index + %25 = memref.load %alloca_9[%24] : memref<16xf32, #gpu.address_space> + %26 = arith.addf %25, %cst : f32 + %27 = arith.addi %24, %c1 : index + %28 = memref.load %alloca_9[%27] : memref<16xf32, #gpu.address_space> + %29 = arith.addf %28, %26 : f32 + memref.store %29, %alloca_10[%1] : memref<8xf32, #gpu.address_space> + } + gpu.barrier + %alloca_11 = memref.alloca() : memref<4xf32, #gpu.address_space> + %21 = arith.cmpi ult, %1, %c4 : index + scf.if %21 { + %24 = arith.muli %1, %c2 : index + %25 = memref.load %alloca_10[%24] : memref<8xf32, #gpu.address_space> + %26 = arith.addf %25, %cst : f32 + %27 = arith.addi %24, %c1 : index + %28 = memref.load %alloca_10[%27] : memref<8xf32, #gpu.address_space> + %29 = arith.addf %28, %26 : f32 + memref.store %29, %alloca_11[%1] : memref<4xf32, #gpu.address_space> + } + gpu.barrier + %alloca_12 = memref.alloca() : memref<2xf32, #gpu.address_space> + %22 = arith.cmpi ult, %1, %c2 : index + scf.if %22 { + %24 = arith.muli %1, %c2 : index + %25 = memref.load %alloca_11[%24] : memref<4xf32, #gpu.address_space> + %26 = arith.addf %25, %cst : f32 + %27 = arith.addi %24, %c1 : index + %28 = memref.load %alloca_11[%27] : memref<4xf32, #gpu.address_space> + %29 = arith.addf %28, %26 : f32 + memref.store %29, %alloca_12[%1] : memref<2xf32, #gpu.address_space> + } + gpu.barrier + %23 = arith.cmpi ult, %1, %c1 : index + scf.if %23 { + %24 = arith.muli %1, %c2 : index + %25 = memref.load %alloca_12[%24] : memref<2xf32, 
#gpu.address_space> + %26 = arith.addf %25, %cst : f32 + %27 = arith.addi %24, %c1 : index + %28 = memref.load %alloca_12[%27] : memref<2xf32, #gpu.address_space> + %29 = arith.addf %28, %26 : f32 + memref.store %29, %arg2[%0] : memref<32xf32> + } + gpu.barrier + gpu.return + } + gpu.func @Unknown147_kernel_0(%arg0: memref<32xf32>, %arg1: memref) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c16 = arith.constant 16 : index + %cst = arith.constant 0.000000e+00 : f32 + %c32 = arith.constant 32 : index + %0 = gpu.block_id x + %alloca = memref.alloca() : memref<32xf32, #gpu.address_space> + %1 = gpu.thread_id x + %2 = arith.muli %0, %c32 : index + %3 = arith.addi %2, %1 : index + %4 = memref.load %arg0[%3] : memref<32xf32> + %5 = arith.addf %4, %cst : f32 + memref.store %5, %alloca[%1] : memref<32xf32, #gpu.address_space> + gpu.barrier + %alloca_0 = memref.alloca() : memref<16xf32, #gpu.address_space> + %6 = arith.cmpi ult, %1, %c16 : index + scf.if %6 { + %11 = arith.muli %1, %c2 : index + %12 = memref.load %alloca[%11] : memref<32xf32, #gpu.address_space> + %13 = arith.addf %12, %cst : f32 + %14 = arith.addi %11, %c1 : index + %15 = memref.load %alloca[%14] : memref<32xf32, #gpu.address_space> + %16 = arith.addf %15, %13 : f32 + memref.store %16, %alloca_0[%1] : memref<16xf32, #gpu.address_space> + } + gpu.barrier + %alloca_1 = memref.alloca() : memref<8xf32, #gpu.address_space> + %7 = arith.cmpi ult, %1, %c8 : index + scf.if %7 { + %11 = arith.muli %1, %c2 : index + %12 = memref.load %alloca_0[%11] : memref<16xf32, #gpu.address_space> + %13 = arith.addf %12, %cst : f32 + %14 = arith.addi %11, %c1 : index + %15 = memref.load %alloca_0[%14] : memref<16xf32, #gpu.address_space> + %16 = arith.addf %15, %13 : f32 + memref.store %16, %alloca_1[%1] : memref<8xf32, #gpu.address_space> + } + gpu.barrier + %alloca_2 = memref.alloca() : memref<4xf32, #gpu.address_space> + %8 = arith.cmpi ult, %1, %c4 : index + scf.if %8 { + %11 = arith.muli %1, %c2 : index + %12 = memref.load %alloca_1[%11] : memref<8xf32, #gpu.address_space> + %13 = arith.addf %12, %cst : f32 + %14 = arith.addi %11, %c1 : index + %15 = memref.load %alloca_1[%14] : memref<8xf32, #gpu.address_space> + %16 = arith.addf %15, %13 : f32 + memref.store %16, %alloca_2[%1] : memref<4xf32, #gpu.address_space> + } + gpu.barrier + %alloca_3 = memref.alloca() : memref<2xf32, #gpu.address_space> + %9 = arith.cmpi ult, %1, %c2 : index + scf.if %9 { + %11 = arith.muli %1, %c2 : index + %12 = memref.load %alloca_2[%11] : memref<4xf32, #gpu.address_space> + %13 = arith.addf %12, %cst : f32 + %14 = arith.addi %11, %c1 : index + %15 = memref.load %alloca_2[%14] : memref<4xf32, #gpu.address_space> + %16 = arith.addf %15, %13 : f32 + memref.store %16, %alloca_3[%1] : memref<2xf32, #gpu.address_space> + } + gpu.barrier + %10 = arith.cmpi ult, %1, %c1 : index + scf.if %10 { + %11 = arith.muli %1, %c2 : index + %12 = memref.load %alloca_3[%11] : memref<2xf32, #gpu.address_space> + %13 = arith.addf %12, %cst : f32 + %14 = arith.addi %11, %c1 : index + %15 = memref.load %alloca_3[%14] : memref<2xf32, #gpu.address_space> + %16 = arith.addf %15, %13 : f32 + memref.store %16, %arg1[] : memref + } + gpu.barrier + gpu.return + } + gpu.func @Unknown171_kernel(%arg0: memref<4x1000xf16>, %arg1: memref<1000xf32>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %cst = 
arith.constant 0.000000e+00 : f32 + %cst_0 = arith.constant 0.000000e+00 : f16 + %c2 = arith.constant 2 : index + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c32 = arith.constant 32 : index + %c1000 = arith.constant 1000 : index + %c-32 = arith.constant -32 : index + %0 = gpu.block_id x + %1 = arith.muli %0, %c-32 : index + %2 = arith.addi %1, %c1000 : index + %3 = arith.cmpi slt, %2, %c32 : index + %4 = arith.select %3, %2, %c32 : index + %5 = arith.muli %0, %c32 : index + %alloca = memref.alloca() : memref<32xf32, #gpu.address_space> + %alloca_1 = memref.alloca() : memref<2x32xf32, #gpu.address_space> + %6 = gpu.thread_id x + %7 = gpu.thread_id y + %8 = arith.cmpi slt, %4, %6 : index + %9 = arith.select %8, %4, %6 : index + %10 = arith.addi %6, %c1 : index + %11 = arith.cmpi slt, %4, %10 : index + %12 = arith.select %11, %4, %10 : index + %13 = arith.subi %12, %9 : index + %14 = arith.cmpi ugt, %13, %c0 : index + %15 = scf.if %14 -> (f16) { + %22 = arith.muli %7, %c2 : index + %23 = arith.addi %5, %9 : index + %24 = memref.load %arg0[%22, %23] : memref<4x1000xf16> + scf.yield %24 : f16 + } else { + scf.yield %cst_0 : f16 + } + %16 = arith.extf %15 : f16 to f32 + %17 = arith.addf %16, %cst : f32 + %18 = scf.if %14 -> (f16) { + %22 = arith.muli %7, %c2 : index + %23 = arith.addi %22, %c1 : index + %24 = arith.addi %5, %9 : index + %25 = memref.load %arg0[%23, %24] : memref<4x1000xf16> + scf.yield %25 : f16 + } else { + scf.yield %cst_0 : f16 + } + %19 = arith.extf %18 : f16 to f32 + %20 = arith.addf %17, %19 : f32 + memref.store %20, %alloca_1[%7, %6] : memref<2x32xf32, #gpu.address_space> + gpu.barrier + %21 = arith.cmpi ult, %7, %c1 : index + scf.if %21 { + %22 = memref.load %alloca_1[%c0, %6] : memref<2x32xf32, #gpu.address_space> + %23 = arith.addf %22, %cst : f32 + %24 = memref.load %alloca_1[%c1, %6] : memref<2x32xf32, #gpu.address_space> + %25 = arith.addf %24, %23 : f32 + memref.store %25, %alloca[%6] : memref<32xf32, #gpu.address_space> + } + gpu.barrier + %subview = memref.subview %alloca[0] [%4] [1] : memref<32xf32, #gpu.address_space> to memref, #gpu.address_space> + %subview_2 = memref.subview %arg1[%5] [%4] [1] : memref<1000xf32> to memref> + memref.copy %subview, %subview_2 : memref, #gpu.address_space> to memref> gpu.return } } - func.func private @Unknown0(memref<4x3x224x224xf32, "cuda">) -> memref<4x3x224x224xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4704 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown0", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown1(memref<64x3x7x7xf32, "cuda">) -> memref<64x3x7x7xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 74 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown1", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown3(memref<64x64x3x3xf32, "cuda">) -> memref<64x64x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown3", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown4(memref<64x64x3x3xf32, "cuda">) -> memref<64x64x3x3xf16, "cuda"> 
attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown4", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown5(memref<64x64x3x3xf32, "cuda">) -> memref<64x64x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown5", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown6(memref<64x64x3x3xf32, "cuda">) -> memref<64x64x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown6", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown7(memref<128x64x1x1xf32, "cuda">) -> memref<128x64x1x1xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 64 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown7", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown8(memref<128x64x3x3xf32, "cuda">) -> memref<128x64x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 576 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown8", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown9(memref<128x128x3x3xf32, "cuda">) -> memref<128x128x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown9", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown10(memref<128x128x3x3xf32, "cuda">) -> memref<128x128x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown10", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown11(memref<128x128x3x3xf32, "cuda">) -> memref<128x128x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown11", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown12(memref<256x128x1x1xf32, "cuda">) -> memref<256x128x1x1xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 256 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown12", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown13(memref<256x128x3x3xf32, "cuda">) -> memref<256x128x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2304 : i32, 
__byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown13", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown14(memref<256x256x3x3xf32, "cuda">) -> memref<256x256x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4608 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown14", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown15(memref<256x256x3x3xf32, "cuda">) -> memref<256x256x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4608 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown15", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown16(memref<256x256x3x3xf32, "cuda">) -> memref<256x256x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4608 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown16", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown17(memref<512x256x1x1xf32, "cuda">) -> memref<512x256x1x1xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1024 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown17", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown18(memref<512x256x3x3xf32, "cuda">) -> memref<512x256x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 9216 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown18", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown19(memref<512x512x3x3xf32, "cuda">) -> memref<512x512x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 18432 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown19", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown20(memref<512x512x3x3xf32, "cuda">) -> memref<512x512x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 18432 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown20", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown21(memref<512x512x3x3xf32, "cuda">) -> memref<512x512x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 18432 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown21", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown22(memref<4x1000xf32, "cuda">) -> memref<4x1000xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 32 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = 
"Unknown22", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown23(memref<1000x512xf32, "cuda">) -> memref<1000x512xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4000 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown23", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown24(memref<4x64x112x112xf16, "cuda">) -> (memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xi1, "cuda">) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 25088 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown24", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown26(memref<4x64x56x56xf16, "cuda">) -> (memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 6272 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown26", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown28(memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">) -> (memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 6272 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown28", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown30(memref<4x64x56x56xf16, "cuda">) -> (memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 6272 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown30", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown32(memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">) -> (memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 6272 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown32", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown35(memref<4x128x28x28xf16, "cuda">) -> (memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 3136 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown35", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown37(memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">) -> (memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 3136 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 
4 : i32], __byre__kernel_name = "Unknown37", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown39(memref<4x128x28x28xf16, "cuda">) -> (memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 3136 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown39", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown41(memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">) -> (memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 3136 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown41", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown44(memref<4x256x14x14xf16, "cuda">) -> (memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown44", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown46(memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">) -> (memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown46", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown48(memref<4x256x14x14xf16, "cuda">) -> (memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown48", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown50(memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">) -> (memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown50", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown53(memref<4x512x7x7xf16, "cuda">) -> (memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda">) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown53", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown55(memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">) -> (memref<4x512x7x7xf16, "cuda">, 
memref<4x512x7x7xi1, "cuda">) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown55", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown57(memref<4x512x7x7xf16, "cuda">) -> (memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda">) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown57", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown59(memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">) -> (memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda">) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown59", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown60(memref<4x512xf16, "cuda">) -> memref<4x512xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 16 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown60", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown61(memref<1000xf32, "cuda">, memref<4x1000xf16, "cuda">) -> memref<4x1000xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 32 : i32, __byre__arg_ranks = [1 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown61", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown62(memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">) -> (memref<4x1000xf16, "cuda">, memref<4x1000xf16, "cuda">) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 32 : i32, __byre__arg_ranks = [1 : i32, 2 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown62", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown63(memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf32, "cuda">) -> (memref<4x1000xf16, "cuda">, memref<4x1000xf32, "cuda">, memref<4x1000xf32, "cuda">) attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 32 : i32, __byre__arg_ranks = [1 : i32, 2 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown63", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32, 4 : i32, 5 : i32, 6 : i32, 7 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown64(memref<4x512xf16, "cuda">, memref<4x512x7x7xi1, "cuda">) -> memref<4x512x7x7xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [2 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown64", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = 
"PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown68(memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda">) -> memref<4x512x7x7xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown68", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown72(memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda">) -> memref<4x512x7x7xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown72", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown76(memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda">) -> memref<4x512x7x7xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown76", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown83(memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">) -> memref<4x256x14x14xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown83", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown87(memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda">) -> memref<4x256x14x14xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown87", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown91(memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">) -> memref<4x256x14x14xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown91", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown95(memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda">) -> memref<4x256x14x14xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1568 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown95", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown102(memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">) -> memref<4x128x28x28xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 3136 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], 
__byre__kernel_name = "Unknown102", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown106(memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda">) -> memref<4x128x28x28xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 3136 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown106", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown110(memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">) -> memref<4x128x28x28xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 3136 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown110", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown114(memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda">) -> memref<4x128x28x28xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 3136 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown114", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown121(memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">) -> memref<4x64x56x56xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 6272 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown121", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown125(memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda">) -> memref<4x64x56x56xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 6272 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown125", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown129(memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">) -> memref<4x64x56x56xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 6272 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown129", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown133(memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda">) -> memref<4x64x56x56xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 6272 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown133", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown137(memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">) -> memref<4x64x56x56xf16, "cuda"> attributes 
{__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 6272 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown137", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown138(memref<4x64x112x112xi1, "cuda">, memref<4x64x112x112xf16, "cuda">) -> memref<4x64x112x112xf16, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 25088 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown138", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown141(memref) -> memref attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [0 : i32, 0 : i32], __byre__kernel_name = "Unknown141", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown142(memref<64x3x7x7xf16, "cuda">) -> memref<64x3x7x7xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 74 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown142", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown143(memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown143", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown144(memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown144", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown145(memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown145", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown146(memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown146", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown147(memref<128x64x3x3xf16, "cuda">) -> memref<128x64x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 576 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown147", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown148(memref<128x128x3x3xf16, "cuda">) -> memref<128x128x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 
1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown148", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown149(memref<128x64x1x1xf16, "cuda">) -> memref<128x64x1x1xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 64 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown149", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown150(memref<128x128x3x3xf16, "cuda">) -> memref<128x128x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown150", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown151(memref<128x128x3x3xf16, "cuda">) -> memref<128x128x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown151", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown152(memref<256x128x3x3xf16, "cuda">) -> memref<256x128x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 2304 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown152", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown153(memref<256x256x3x3xf16, "cuda">) -> memref<256x256x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4608 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown153", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown154(memref<256x128x1x1xf16, "cuda">) -> memref<256x128x1x1xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 256 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown154", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown155(memref<256x256x3x3xf16, "cuda">) -> memref<256x256x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4608 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown155", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown156(memref<256x256x3x3xf16, "cuda">) -> memref<256x256x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4608 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown156", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown157(memref<512x256x3x3xf16, "cuda">) -> memref<512x256x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 9216 : i32, __byre__arg_ranks = [4 : i32, 4 : 
i32], __byre__kernel_name = "Unknown157", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown158(memref<512x512x3x3xf16, "cuda">) -> memref<512x512x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 18432 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown158", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown159(memref<512x256x1x1xf16, "cuda">) -> memref<512x256x1x1xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 1024 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown159", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown160(memref<512x512x3x3xf16, "cuda">) -> memref<512x512x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 18432 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown160", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown161(memref<512x512x3x3xf16, "cuda">) -> memref<512x512x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 18432 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown161", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown163(memref<1000x512xf16, "cuda">) -> memref<1000x512xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 4000 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown163", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} - func.func private @Unknown164(memref<1000xf32, "cuda">) -> memref<1000xf32, "cuda"> attributes {__byre__BlockSize.x = 128 : i32, __byre__GridSize.x = 8 : i32, __byre__arg_ranks = [1 : i32, 1 : i32], __byre__kernel_name = "Unknown164", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown0(memref<4x3x224x224xf32, "cuda">) -> memref<4x3x224x224xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 588 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown0", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown1(memref<64x3x7x7xf32, "cuda">) -> memref<64x3x7x7xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 10 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown1", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown3(memref<64x64x3x3xf32, "cuda">) -> memref<64x64x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 36 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown3", __byteir_elementwise_fusion__, arg_offsets = [0 : 
i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown7(memref<128x64x1x1xf32, "cuda">) -> memref<128x64x1x1xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 8 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown7", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown8(memref<128x64x3x3xf32, "cuda">) -> memref<128x64x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 72 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown8", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown9(memref<128x128x3x3xf32, "cuda">) -> memref<128x128x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 144 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown9", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown12(memref<256x128x1x1xf32, "cuda">) -> memref<256x128x1x1xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 32 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown12", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown13(memref<256x128x3x3xf32, "cuda">) -> memref<256x128x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown13", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown14(memref<256x256x3x3xf32, "cuda">) -> memref<256x256x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 576 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown14", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown17(memref<512x256x1x1xf32, "cuda">) -> memref<512x256x1x1xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 128 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown17", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown18(memref<512x256x3x3xf32, "cuda">) -> memref<512x256x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown18", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown19(memref<512x512x3x3xf32, "cuda">) -> memref<512x512x3x3xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 2304 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown19", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = 
"cuda"} + func.func private @Unknown22(memref<4x1000xf32, "cuda">) -> memref<4x1000xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown22", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown23(memref<1000x512xf32, "cuda">) -> memref<1000x512xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 500 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown23", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown24(memref<1000xf32, "cuda">) -> memref<1000xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32], __byre__kernel_name = "Unknown24", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown26(memref<4x64x112x112xf16, "cuda">) -> (memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xi1, "cuda">) attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 3136 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown26", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown28(memref<4x64x56x56xf16, "cuda">) -> (memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">) attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown28", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown30(memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">) -> (memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">) attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown30", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown37(memref<4x128x28x28xf16, "cuda">) -> (memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">) attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 392 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown37", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown39(memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">) -> (memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">) attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 392 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown39", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown46(memref<4x256x14x14xf16, "cuda">) -> (memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, 
"cuda">) attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown46", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown48(memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">) -> (memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">) attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown48", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown55(memref<4x512x7x7xf16, "cuda">) -> (memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda">) attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 98 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown55", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown57(memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">) -> (memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda">) attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 98 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown57", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown63(memref<4x512xf16, "cuda">) -> memref<4x512xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 2 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown63", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown64(memref<1000xf16, "cuda">, memref<4x1000xf16, "cuda">) -> memref<4x1000xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [1 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown64", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown66(memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">) -> memref<4x1000xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 4 : i32, __byre__arg_ranks = [1 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown66", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown68(memref<4xf16, "cuda">) -> memref<4xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32], __byre__kernel_name = "Unknown68", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown69(memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">) -> (memref<4x1000xf16, "cuda">, memref<4x1000xf16, "cuda">) attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 4 : 
i32, __byre__arg_ranks = [1 : i32, 2 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32], __byre__kernel_name = "Unknown69", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32, 4 : i32, 5 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown70(memref<4x512xf16, "cuda">, memref<4x512x7x7xi1, "cuda">) -> memref<4x512x7x7xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 98 : i32, __byre__arg_ranks = [2 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown70", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown74(memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda">) -> memref<4x512x7x7xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 98 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown74", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown78(memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda">) -> memref<4x512x7x7xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 98 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown78", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown89(memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">) -> memref<4x256x14x14xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown89", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown93(memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda">) -> memref<4x256x14x14xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 196 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown93", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown108(memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">) -> memref<4x128x28x28xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 392 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown108", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown112(memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda">) -> memref<4x128x28x28xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 392 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown112", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown127(memref<4x64x56x56xf16, "cuda">, 
memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">) -> memref<4x64x56x56xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown127", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32, 3 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown131(memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda">) -> memref<4x64x56x56xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown131", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown143(memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">) -> memref<4x64x56x56xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 784 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown143", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown144(memref<4x64x112x112xi1, "cuda">, memref<4x64x112x112xf16, "cuda">) -> memref<4x64x112x112xf16, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 3136 : i32, __byre__arg_ranks = [4 : i32, 4 : i32, 4 : i32], __byre__kernel_name = "Unknown144", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32, 2 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown148(memref) -> memref attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [0 : i32, 0 : i32], __byre__kernel_name = "Unknown148", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown149(memref<64x3x7x7xf16, "cuda">) -> memref<64x3x7x7xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 10 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown149", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown150(memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 36 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown150", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown154(memref<128x64x3x3xf16, "cuda">) -> memref<128x64x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 72 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown154", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown155(memref<128x128x3x3xf16, "cuda">) -> memref<128x128x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 144 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown155", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], 
byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown156(memref<128x64x1x1xf16, "cuda">) -> memref<128x64x1x1xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 8 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown156", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown159(memref<256x128x3x3xf16, "cuda">) -> memref<256x128x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 288 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown159", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown160(memref<256x256x3x3xf16, "cuda">) -> memref<256x256x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 576 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown160", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown161(memref<256x128x1x1xf16, "cuda">) -> memref<256x128x1x1xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 32 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown161", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown164(memref<512x256x3x3xf16, "cuda">) -> memref<512x256x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1152 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown164", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown165(memref<512x512x3x3xf16, "cuda">) -> memref<512x512x3x3xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 2304 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown165", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown166(memref<512x256x1x1xf16, "cuda">) -> memref<512x256x1x1xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 128 : i32, __byre__arg_ranks = [4 : i32, 4 : i32], __byre__kernel_name = "Unknown166", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown170(memref<1000x512xf16, "cuda">) -> memref<1000x512xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 500 : i32, __byre__arg_ranks = [2 : i32, 2 : i32], __byre__kernel_name = "Unknown170", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} + func.func private @Unknown172(memref<1000xf32, "cuda">) -> memref<1000xf32, "cuda"> attributes {__byre__BlockSize.x = 256 : i32, __byre__GridSize.x = 1 : i32, __byre__arg_ranks = [1 : i32, 1 : i32], __byre__kernel_name = "Unknown172", __byteir_elementwise_fusion__, arg_offsets = [0 : i32, 1 : i32], byre_compute_name = "PTXOp", byre_force_compute_name, device = "cuda"} 
func.func @main(%arg0: memref<4x3x224x224xf32, "cuda">, %arg1: memref<4x1000xf32, "cuda">, %arg2: memref<64x3x7x7xf32, "cuda">, %arg3: memref<64xf32, "cuda">, %arg4: memref<64xf32, "cuda">, %arg5: memref<64xf32, "cuda">, %arg6: memref<64xf32, "cuda">, %arg7: memref<64x64x3x3xf32, "cuda">, %arg8: memref<64xf32, "cuda">, %arg9: memref<64xf32, "cuda">, %arg10: memref<64xf32, "cuda">, %arg11: memref<64xf32, "cuda">, %arg12: memref<64x64x3x3xf32, "cuda">, %arg13: memref<64xf32, "cuda">, %arg14: memref<64xf32, "cuda">, %arg15: memref<64xf32, "cuda">, %arg16: memref<64xf32, "cuda">, %arg17: memref<64x64x3x3xf32, "cuda">, %arg18: memref<64xf32, "cuda">, %arg19: memref<64xf32, "cuda">, %arg20: memref<64xf32, "cuda">, %arg21: memref<64xf32, "cuda">, %arg22: memref<64x64x3x3xf32, "cuda">, %arg23: memref<64xf32, "cuda">, %arg24: memref<64xf32, "cuda">, %arg25: memref<64xf32, "cuda">, %arg26: memref<64xf32, "cuda">, %arg27: memref<128x64x3x3xf32, "cuda">, %arg28: memref<128xf32, "cuda">, %arg29: memref<128xf32, "cuda">, %arg30: memref<128xf32, "cuda">, %arg31: memref<128xf32, "cuda">, %arg32: memref<128x128x3x3xf32, "cuda">, %arg33: memref<128xf32, "cuda">, %arg34: memref<128xf32, "cuda">, %arg35: memref<128xf32, "cuda">, %arg36: memref<128xf32, "cuda">, %arg37: memref<128x64x1x1xf32, "cuda">, %arg38: memref<128xf32, "cuda">, %arg39: memref<128xf32, "cuda">, %arg40: memref<128xf32, "cuda">, %arg41: memref<128xf32, "cuda">, %arg42: memref<128x128x3x3xf32, "cuda">, %arg43: memref<128xf32, "cuda">, %arg44: memref<128xf32, "cuda">, %arg45: memref<128xf32, "cuda">, %arg46: memref<128xf32, "cuda">, %arg47: memref<128x128x3x3xf32, "cuda">, %arg48: memref<128xf32, "cuda">, %arg49: memref<128xf32, "cuda">, %arg50: memref<128xf32, "cuda">, %arg51: memref<128xf32, "cuda">, %arg52: memref<256x128x3x3xf32, "cuda">, %arg53: memref<256xf32, "cuda">, %arg54: memref<256xf32, "cuda">, %arg55: memref<256xf32, "cuda">, %arg56: memref<256xf32, "cuda">, %arg57: memref<256x256x3x3xf32, "cuda">, %arg58: memref<256xf32, "cuda">, %arg59: memref<256xf32, "cuda">, %arg60: memref<256xf32, "cuda">, %arg61: memref<256xf32, "cuda">, %arg62: memref<256x128x1x1xf32, "cuda">, %arg63: memref<256xf32, "cuda">, %arg64: memref<256xf32, "cuda">, %arg65: memref<256xf32, "cuda">, %arg66: memref<256xf32, "cuda">, %arg67: memref<256x256x3x3xf32, "cuda">, %arg68: memref<256xf32, "cuda">, %arg69: memref<256xf32, "cuda">, %arg70: memref<256xf32, "cuda">, %arg71: memref<256xf32, "cuda">, %arg72: memref<256x256x3x3xf32, "cuda">, %arg73: memref<256xf32, "cuda">, %arg74: memref<256xf32, "cuda">, %arg75: memref<256xf32, "cuda">, %arg76: memref<256xf32, "cuda">, %arg77: memref<512x256x3x3xf32, "cuda">, %arg78: memref<512xf32, "cuda">, %arg79: memref<512xf32, "cuda">, %arg80: memref<512xf32, "cuda">, %arg81: memref<512xf32, "cuda">, %arg82: memref<512x512x3x3xf32, "cuda">, %arg83: memref<512xf32, "cuda">, %arg84: memref<512xf32, "cuda">, %arg85: memref<512xf32, "cuda">, %arg86: memref<512xf32, "cuda">, %arg87: memref<512x256x1x1xf32, "cuda">, %arg88: memref<512xf32, "cuda">, %arg89: memref<512xf32, "cuda">, %arg90: memref<512xf32, "cuda">, %arg91: memref<512xf32, "cuda">, %arg92: memref<512x512x3x3xf32, "cuda">, %arg93: memref<512xf32, "cuda">, %arg94: memref<512xf32, "cuda">, %arg95: memref<512xf32, "cuda">, %arg96: memref<512xf32, "cuda">, %arg97: memref<512x512x3x3xf32, "cuda">, %arg98: memref<512xf32, "cuda">, %arg99: memref<512xf32, "cuda">, %arg100: memref<512xf32, "cuda">, %arg101: memref<512xf32, "cuda">, %arg102: memref<1000x512xf32, "cuda">, 
%arg103: memref<1000xf32, "cuda">) -> (memref, memref<64x3x7x7xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64x64x3x3xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64x64x3x3xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64x64x3x3xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64x64x3x3xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<128x64x3x3xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128x128x3x3xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128x64x1x1xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128x128x3x3xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128x128x3x3xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<256x128x3x3xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256x256x3x3xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256x128x1x1xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256x256x3x3xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256x256x3x3xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<512x256x3x3xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512x512x3x3xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512x256x1x1xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512x512x3x3xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512x512x3x3xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<1000x512xf32, "cuda">, memref<1000xf32, "cuda">) attributes {__placeholder__byre.entry_point} { %0 = call @Unknown0(%arg0) : (memref<4x3x224x224xf32, "cuda">) -> memref<4x3x224x224xf16, "cuda"> %1 = call @Unknown1(%arg2) : (memref<64x3x7x7xf32, "cuda">) -> memref<64x3x7x7xf16, "cuda"> @@ -4064,344 +2197,354 @@ module @IrToMhlo.2452 attributes {gpu.container_module} { %alloc_0 = memref.alloc() : memref<4x64x112x112xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc, %arg3, %arg4, %alloc_0) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x112x112xf16, "cuda"> %2 = call @Unknown3(%arg7) : (memref<64x64x3x3xf32, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - %3 = call @Unknown4(%arg12) : (memref<64x64x3x3xf32, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - %4 = call @Unknown5(%arg17) : (memref<64x64x3x3xf32, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - %5 = call @Unknown6(%arg22) : (memref<64x64x3x3xf32, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + %3 = call @Unknown3(%arg12) : (memref<64x64x3x3xf32, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + %4 = call @Unknown3(%arg17) : (memref<64x64x3x3xf32, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + %5 = call @Unknown3(%arg22) : (memref<64x64x3x3xf32, "cuda">) -> memref<64x64x3x3xf16, "cuda"> %6 = call @Unknown7(%arg37) : (memref<128x64x1x1xf32, "cuda">) -> memref<128x64x1x1xf16, "cuda"> %7 = call @Unknown8(%arg27) : (memref<128x64x3x3xf32, "cuda">) -> memref<128x64x3x3xf16, "cuda"> %8 = call @Unknown9(%arg32) : (memref<128x128x3x3xf32, "cuda">) -> memref<128x128x3x3xf16, "cuda"> - %9 = call @Unknown10(%arg42) : (memref<128x128x3x3xf32, "cuda">) -> memref<128x128x3x3xf16, "cuda"> - %10 = call @Unknown11(%arg47) : 
(memref<128x128x3x3xf32, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + %9 = call @Unknown9(%arg42) : (memref<128x128x3x3xf32, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + %10 = call @Unknown9(%arg47) : (memref<128x128x3x3xf32, "cuda">) -> memref<128x128x3x3xf16, "cuda"> %11 = call @Unknown12(%arg62) : (memref<256x128x1x1xf32, "cuda">) -> memref<256x128x1x1xf16, "cuda"> %12 = call @Unknown13(%arg52) : (memref<256x128x3x3xf32, "cuda">) -> memref<256x128x3x3xf16, "cuda"> %13 = call @Unknown14(%arg57) : (memref<256x256x3x3xf32, "cuda">) -> memref<256x256x3x3xf16, "cuda"> - %14 = call @Unknown15(%arg67) : (memref<256x256x3x3xf32, "cuda">) -> memref<256x256x3x3xf16, "cuda"> - %15 = call @Unknown16(%arg72) : (memref<256x256x3x3xf32, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + %14 = call @Unknown14(%arg67) : (memref<256x256x3x3xf32, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + %15 = call @Unknown14(%arg72) : (memref<256x256x3x3xf32, "cuda">) -> memref<256x256x3x3xf16, "cuda"> %16 = call @Unknown17(%arg87) : (memref<512x256x1x1xf32, "cuda">) -> memref<512x256x1x1xf16, "cuda"> %17 = call @Unknown18(%arg77) : (memref<512x256x3x3xf32, "cuda">) -> memref<512x256x3x3xf16, "cuda"> %18 = call @Unknown19(%arg82) : (memref<512x512x3x3xf32, "cuda">) -> memref<512x512x3x3xf16, "cuda"> - %19 = call @Unknown20(%arg92) : (memref<512x512x3x3xf32, "cuda">) -> memref<512x512x3x3xf16, "cuda"> - %20 = call @Unknown21(%arg97) : (memref<512x512x3x3xf32, "cuda">) -> memref<512x512x3x3xf16, "cuda"> + %19 = call @Unknown19(%arg92) : (memref<512x512x3x3xf32, "cuda">) -> memref<512x512x3x3xf16, "cuda"> + %20 = call @Unknown19(%arg97) : (memref<512x512x3x3xf32, "cuda">) -> memref<512x512x3x3xf16, "cuda"> %21 = call @Unknown22(%arg1) : (memref<4x1000xf32, "cuda">) -> memref<4x1000xf16, "cuda"> %22 = call @Unknown23(%arg102) : (memref<1000x512xf32, "cuda">) -> memref<1000x512xf16, "cuda"> + %23 = call @Unknown24(%arg103) : (memref<1000xf32, "cuda">) -> memref<1000xf16, "cuda"> %alloc_1 = memref.alloc() : memref<4xf16, "cuda"> - byre.compute @ReduceSumOp_f16_f16(%21, %alloc_1) {device = "cuda", dimensions = dense<1> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf16, "cuda">, memref<4xf16, "cuda"> - %23:2 = call @Unknown24(%alloc_0) : (memref<4x64x112x112xf16, "cuda">) -> (memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xi1, "cuda">) + byre.compute @PTXOp(%21, %alloc_1) {BlockSize.x = 512 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 4 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown25_kernel"} : memref<4x1000xf16, "cuda">, memref<4xf16, "cuda"> + %24:2 = call @Unknown26(%alloc_0) : (memref<4x64x112x112xf16, "cuda">) -> (memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xi1, "cuda">) %alloc_2 = memref.alloc() : memref<4x64x56x56xf16, "cuda"> - byre.compute @PoolMaxOp_f16_f16(%23#0, %alloc_2) {base_dilations = dense<1> : tensor<4xi64>, device = "cuda", memory_effects = [1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = dense<1> : tensor<4xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + byre.compute @PoolMaxOp_f16_f16(%24#0, %alloc_2) {base_dilations = dense<1> : tensor<4xi64>, device = "cuda", memory_effects = [1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = dense<1> : tensor<4xi64>, 
window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> %alloc_3 = memref.alloc() : memref<4x64x56x56xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%alloc_2, %2, %alloc_3) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> %alloc_4 = memref.alloc() : memref<4x64x56x56xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_3, %arg8, %arg9, %alloc_4) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> - %24:2 = call @Unknown26(%alloc_4) : (memref<4x64x56x56xf16, "cuda">) -> (memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">) + %25:2 = call @Unknown28(%alloc_4) : (memref<4x64x56x56xf16, "cuda">) -> (memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">) %alloc_5 = memref.alloc() : memref<4x64x56x56xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%24#0, %3, %alloc_5) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%25#0, %3, %alloc_5) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> %alloc_6 = memref.alloc() : memref<4x64x56x56xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_5, %arg13, %arg14, %alloc_6) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> - %25:2 = call @Unknown28(%alloc_6, %alloc_2) : (memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">) -> (memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">) + %26:2 = call @Unknown30(%alloc_6, %alloc_2) : (memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">) -> (memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">) %alloc_7 = memref.alloc() : memref<4x64x56x56xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%25#0, %4, %alloc_7) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], 
output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%26#0, %4, %alloc_7) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> %alloc_8 = memref.alloc() : memref<4x64x56x56xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_7, %arg18, %arg19, %alloc_8) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> - %26:2 = call @Unknown30(%alloc_8) : (memref<4x64x56x56xf16, "cuda">) -> (memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">) + %27:2 = call @Unknown28(%alloc_8) : (memref<4x64x56x56xf16, "cuda">) -> (memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">) %alloc_9 = memref.alloc() : memref<4x64x56x56xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%26#0, %5, %alloc_9) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%27#0, %5, %alloc_9) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> %alloc_10 = memref.alloc() : memref<4x64x56x56xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_9, %arg23, %arg24, %alloc_10) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> - %27:2 = call @Unknown32(%alloc_10, %25#0) : (memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">) -> (memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">) + %28:2 = call @Unknown30(%alloc_10, %26#0) : (memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">) -> (memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">) %alloc_11 = memref.alloc() : memref<4x128x28x28xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%27#0, %6, %alloc_11) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : 
i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<128x64x1x1xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%28#0, %6, %alloc_11) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<128x64x1x1xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> %alloc_12 = memref.alloc() : memref<4x128x28x28xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_11, %arg38, %arg39, %alloc_12) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda"> %alloc_13 = memref.alloc() : memref<4x128x28x28xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%27#0, %7, %alloc_13) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<128x64x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%28#0, %7, %alloc_13) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<128x64x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> %alloc_14 = memref.alloc() : memref<4x128x28x28xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_13, %arg28, %arg29, %alloc_14) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda"> - %28:2 = call @Unknown35(%alloc_14) : (memref<4x128x28x28xf16, "cuda">) -> (memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">) + %29:2 = call @Unknown37(%alloc_14) : (memref<4x128x28x28xf16, "cuda">) -> (memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">) %alloc_15 = memref.alloc() : memref<4x128x28x28xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%28#0, %8, %alloc_15) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%29#0, %8, %alloc_15) 
{batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> %alloc_16 = memref.alloc() : memref<4x128x28x28xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_15, %arg33, %arg34, %alloc_16) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda"> - %29:2 = call @Unknown37(%alloc_16, %alloc_12) : (memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">) -> (memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">) + %30:2 = call @Unknown39(%alloc_16, %alloc_12) : (memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">) -> (memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">) %alloc_17 = memref.alloc() : memref<4x128x28x28xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%29#0, %9, %alloc_17) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%30#0, %9, %alloc_17) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> %alloc_18 = memref.alloc() : memref<4x128x28x28xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_17, %arg43, %arg44, %alloc_18) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda"> - %30:2 = call @Unknown39(%alloc_18) : (memref<4x128x28x28xf16, "cuda">) -> (memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">) + %31:2 = call @Unknown37(%alloc_18) : (memref<4x128x28x28xf16, "cuda">) -> (memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">) %alloc_19 = memref.alloc() : memref<4x128x28x28xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%30#0, %10, %alloc_19) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, 
memref<4x128x28x28xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%31#0, %10, %alloc_19) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> %alloc_20 = memref.alloc() : memref<4x128x28x28xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_19, %arg48, %arg49, %alloc_20) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda"> - %31:2 = call @Unknown41(%alloc_20, %29#0) : (memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">) -> (memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">) + %32:2 = call @Unknown39(%alloc_20, %30#0) : (memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">) -> (memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">) %alloc_21 = memref.alloc() : memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%31#0, %11, %alloc_21) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<256x128x1x1xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%32#0, %11, %alloc_21) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<256x128x1x1xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> %alloc_22 = memref.alloc() : memref<4x256x14x14xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_21, %arg63, %arg64, %alloc_22) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> %alloc_23 = memref.alloc() : memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%31#0, %12, %alloc_23) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%32#0, %12, %alloc_23) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", 
lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> %alloc_24 = memref.alloc() : memref<4x256x14x14xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_23, %arg53, %arg54, %alloc_24) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> - %32:2 = call @Unknown44(%alloc_24) : (memref<4x256x14x14xf16, "cuda">) -> (memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">) + %33:2 = call @Unknown46(%alloc_24) : (memref<4x256x14x14xf16, "cuda">) -> (memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">) %alloc_25 = memref.alloc() : memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%32#0, %13, %alloc_25) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%33#0, %13, %alloc_25) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> %alloc_26 = memref.alloc() : memref<4x256x14x14xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_25, %arg58, %arg59, %alloc_26) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> - %33:2 = call @Unknown46(%alloc_26, %alloc_22) : (memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">) -> (memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">) + %34:2 = call @Unknown48(%alloc_26, %alloc_22) : (memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">) -> (memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">) %alloc_27 = memref.alloc() : memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%33#0, %14, %alloc_27) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%34#0, %14, %alloc_27) {batch_group_count = 1 : i64, device = "cuda", 
feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> %alloc_28 = memref.alloc() : memref<4x256x14x14xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_27, %arg68, %arg69, %alloc_28) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> - %34:2 = call @Unknown48(%alloc_28) : (memref<4x256x14x14xf16, "cuda">) -> (memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">) + %35:2 = call @Unknown46(%alloc_28) : (memref<4x256x14x14xf16, "cuda">) -> (memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">) %alloc_29 = memref.alloc() : memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%34#0, %15, %alloc_29) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%35#0, %15, %alloc_29) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> %alloc_30 = memref.alloc() : memref<4x256x14x14xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_29, %arg73, %arg74, %alloc_30) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> - %35:2 = call @Unknown50(%alloc_30, %33#0) : (memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">) -> (memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">) + %36:2 = call @Unknown48(%alloc_30, %34#0) : (memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">) -> (memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">) %alloc_31 = memref.alloc() : memref<4x512x7x7xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%35#0, %16, %alloc_31) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%36#0, %16, 
%alloc_31) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> %alloc_32 = memref.alloc() : memref<4x512x7x7xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_31, %arg88, %arg89, %alloc_32) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> %alloc_33 = memref.alloc() : memref<4x512x7x7xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%35#0, %17, %alloc_33) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<512x256x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%36#0, %17, %alloc_33) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<512x256x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> %alloc_34 = memref.alloc() : memref<4x512x7x7xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_33, %arg78, %arg79, %alloc_34) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> - %36:2 = call @Unknown53(%alloc_34) : (memref<4x512x7x7xf16, "cuda">) -> (memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda">) + %37:2 = call @Unknown55(%alloc_34) : (memref<4x512x7x7xf16, "cuda">) -> (memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda">) %alloc_35 = memref.alloc() : memref<4x512x7x7xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%36#0, %18, %alloc_35) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%37#0, %18, %alloc_35) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides 
= dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> %alloc_36 = memref.alloc() : memref<4x512x7x7xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_35, %arg83, %arg84, %alloc_36) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> - %37:2 = call @Unknown55(%alloc_36, %alloc_32) : (memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">) -> (memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda">) + %38:2 = call @Unknown57(%alloc_36, %alloc_32) : (memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">) -> (memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda">) %alloc_37 = memref.alloc() : memref<4x512x7x7xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%37#0, %19, %alloc_37) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%38#0, %19, %alloc_37) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> %alloc_38 = memref.alloc() : memref<4x512x7x7xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_37, %arg93, %arg94, %alloc_38) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> - %38:2 = call @Unknown57(%alloc_38) : (memref<4x512x7x7xf16, "cuda">) -> (memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda">) + %39:2 = call @Unknown55(%alloc_38) : (memref<4x512x7x7xf16, "cuda">) -> (memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda">) %alloc_39 = memref.alloc() : memref<4x512x7x7xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%38#0, %20, %alloc_39) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%39#0, %20, %alloc_39) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, 
window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> %alloc_40 = memref.alloc() : memref<4x512x7x7xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%alloc_39, %arg98, %arg99, %alloc_40) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> - %39:2 = call @Unknown59(%alloc_40, %37#0) : (memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">) -> (memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda">) - %alloc_41 = memref.alloc() : memref<4x512xf16, "cuda"> - byre.compute @ReduceSumOp_f16_f16(%39#0, %alloc_41) {device = "cuda", dimensions = dense<[3, 2]> : tensor<2xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512xf16, "cuda"> - %40 = call @Unknown60(%alloc_41) : (memref<4x512xf16, "cuda">) -> memref<4x512xf16, "cuda"> + %40:2 = call @Unknown57(%alloc_40, %38#0) : (memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">) -> (memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda">) + %collapse_shape = memref.collapse_shape %40#0 [[0, 1], [2, 3]] : memref<4x512x7x7xf16, "cuda"> into memref<2048x49xf16, "cuda"> + %alloc_41 = memref.alloc() : memref<2048xf16, "cuda"> + byre.compute @PTXOp(%collapse_shape, %alloc_41) {BlockSize.x = 64 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 2048 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown62_kernel"} : memref<2048x49xf16, "cuda">, memref<2048xf16, "cuda"> + %expand_shape = memref.expand_shape %alloc_41 [[0, 1]] : memref<2048xf16, "cuda"> into memref<4x512xf16, "cuda"> + %41 = call @Unknown63(%expand_shape) : (memref<4x512xf16, "cuda">) -> memref<4x512xf16, "cuda"> %alloc_42 = memref.alloc() : memref<4x1000xf16, "cuda"> - byre.compute @MatmulOp_f16f16_f16(%40, %22, %alloc_42) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<4x512xf16, "cuda">, memref<1000x512xf16, "cuda">, memref<4x1000xf16, "cuda"> - %41 = call @Unknown61(%arg103, %alloc_42) : (memref<1000xf32, "cuda">, memref<4x1000xf16, "cuda">) -> memref<4x1000xf16, "cuda"> + byre.compute @MatmulOp_f16f16_f16(%41, %22, %alloc_42) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<4x512xf16, "cuda">, memref<1000x512xf16, "cuda">, memref<4x1000xf16, "cuda"> + %42 = call @Unknown64(%23, %alloc_42) : (memref<1000xf16, "cuda">, memref<4x1000xf16, "cuda">) -> memref<4x1000xf16, "cuda"> %alloc_43 = memref.alloc() : memref<4xf16, "cuda"> - byre.compute @ReduceMaxOp_f16_f16(%41, %alloc_43) {device = "cuda", dimensions = dense<1> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf16, "cuda">, memref<4xf16, "cuda"> - %42:2 = call @Unknown62(%alloc_43, %41) : (memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">) -> (memref<4x1000xf16, "cuda">, memref<4x1000xf16, "cuda">) + byre.compute @PTXOp(%42, %alloc_43) {BlockSize.x = 512 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 4 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown65_kernel"} : memref<4x1000xf16, "cuda">, memref<4xf16, "cuda"> + %43 = call @Unknown66(%alloc_43, %42) : (memref<4xf16, 
"cuda">, memref<4x1000xf16, "cuda">) -> memref<4x1000xf16, "cuda"> %alloc_44 = memref.alloc() : memref<4xf16, "cuda"> - byre.compute @ReduceSumOp_f16_f16(%42#1, %alloc_44) {device = "cuda", dimensions = dense<1> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf16, "cuda">, memref<4xf16, "cuda"> - %43:3 = call @Unknown63(%alloc_44, %42#0, %alloc_1, %21, %arg1) : (memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf32, "cuda">) -> (memref<4x1000xf16, "cuda">, memref<4x1000xf32, "cuda">, memref<4x1000xf32, "cuda">) + byre.compute @PTXOp(%43, %alloc_44) {BlockSize.x = 512 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 4 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown67_kernel"} : memref<4x1000xf16, "cuda">, memref<4xf16, "cuda"> + %44 = call @Unknown68(%alloc_44) : (memref<4xf16, "cuda">) -> memref<4xf16, "cuda"> + %45:2 = call @Unknown69(%44, %43, %alloc_1, %21) : (memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">) -> (memref<4x1000xf16, "cuda">, memref<4x1000xf16, "cuda">) %alloc_45 = memref.alloc() : memref<4x512xf16, "cuda"> - byre.compute @MatmulOp_f16f16_f16(%43#0, %22, %alloc_45) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<4x1000xf16, "cuda">, memref<1000x512xf16, "cuda">, memref<4x512xf16, "cuda"> - %44 = call @Unknown64(%alloc_45, %39#1) : (memref<4x512xf16, "cuda">, memref<4x512x7x7xi1, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @MatmulOp_f16f16_f16(%45#1, %22, %alloc_45) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<4x1000xf16, "cuda">, memref<1000x512xf16, "cuda">, memref<4x512xf16, "cuda"> + %46 = call @Unknown70(%alloc_45, %40#1) : (memref<4x512xf16, "cuda">, memref<4x512x7x7xi1, "cuda">) -> memref<4x512x7x7xf16, "cuda"> %alloc_46 = memref.alloc() : memref<4x512x7x7xf16, "cuda"> %alloc_47 = memref.alloc() : memref<512xf32, "cuda"> %alloc_48 = memref.alloc() : memref<512xf32, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_39, %arg98, %44, %alloc_46, %alloc_47, %alloc_48) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_39, %arg98, %46, %alloc_46, %alloc_47, %alloc_48) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> %alloc_49 = memref.alloc() : memref<4x512x7x7xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_46, %20, %alloc_49) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, 
"cuda">, memref<4x512x7x7xf16, "cuda"> %alloc_50 = memref.alloc() : memref<512x512x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%38#0, %alloc_46, %alloc_50) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> - %45 = call @Unknown68(%38#1, %alloc_49) : (memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%39#0, %alloc_46, %alloc_50) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> + %47 = call @Unknown74(%39#1, %alloc_49) : (memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda">) -> memref<4x512x7x7xf16, "cuda"> %alloc_51 = memref.alloc() : memref<4x512x7x7xf16, "cuda"> %alloc_52 = memref.alloc() : memref<512xf32, "cuda"> %alloc_53 = memref.alloc() : memref<512xf32, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_37, %arg93, %45, %alloc_51, %alloc_52, %alloc_53) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_37, %arg93, %47, %alloc_51, %alloc_52, %alloc_53) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> %alloc_54 = memref.alloc() : memref<4x512x7x7xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_51, %19, %alloc_54) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> %alloc_55 = memref.alloc() : memref<512x512x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%37#0, %alloc_51, %alloc_55) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> - %46 = call @Unknown72(%44, %alloc_54, %37#1) : (memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%38#0, %alloc_51, %alloc_55) 
{batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> + %48 = call @Unknown78(%46, %alloc_54, %38#1) : (memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda">) -> memref<4x512x7x7xf16, "cuda"> %alloc_56 = memref.alloc() : memref<4x512x7x7xf16, "cuda"> %alloc_57 = memref.alloc() : memref<512xf32, "cuda"> %alloc_58 = memref.alloc() : memref<512xf32, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_35, %arg83, %46, %alloc_56, %alloc_57, %alloc_58) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_35, %arg83, %48, %alloc_56, %alloc_57, %alloc_58) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> %alloc_59 = memref.alloc() : memref<4x512x7x7xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_56, %18, %alloc_59) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> %alloc_60 = memref.alloc() : memref<512x512x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%36#0, %alloc_56, %alloc_60) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> - %47 = call @Unknown76(%36#1, %alloc_59) : (memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%37#0, %alloc_56, %alloc_60) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> + %49 = call @Unknown74(%37#1, %alloc_59) : (memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda">) -> memref<4x512x7x7xf16, "cuda"> %alloc_61 = memref.alloc() : memref<4x512x7x7xf16, "cuda"> %alloc_62 = memref.alloc() : memref<512xf32, "cuda"> %alloc_63 = memref.alloc() : memref<512xf32, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_33, %arg78, %47, %alloc_61, 
%alloc_62, %alloc_63) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_33, %arg78, %49, %alloc_61, %alloc_62, %alloc_63) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> %alloc_64 = memref.alloc() : memref<4x256x14x14xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_61, %17, %alloc_64) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> %alloc_65 = memref.alloc() : memref<512x256x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%35#0, %alloc_61, %alloc_65) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x256x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%36#0, %alloc_61, %alloc_65) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x256x3x3xf16, "cuda"> %alloc_66 = memref.alloc() : memref<4x512x7x7xf16, "cuda"> %alloc_67 = memref.alloc() : memref<512xf32, "cuda"> %alloc_68 = memref.alloc() : memref<512xf32, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_31, %arg88, %46, %alloc_66, %alloc_67, %alloc_68) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_31, %arg88, %48, %alloc_66, %alloc_67, %alloc_68) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> %alloc_69 = memref.alloc() : memref<4x256x14x14xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_66, %16, %alloc_69) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], 
output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> %alloc_70 = memref.alloc() : memref<512x256x1x1xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%35#0, %alloc_66, %alloc_70) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda"> - %48 = call @Unknown83(%alloc_69, %alloc_64, %35#1) : (memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%36#0, %alloc_66, %alloc_70) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda"> + %50 = call @Unknown89(%alloc_69, %alloc_64, %36#1) : (memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">) -> memref<4x256x14x14xf16, "cuda"> %alloc_71 = memref.alloc() : memref<4x256x14x14xf16, "cuda"> %alloc_72 = memref.alloc() : memref<256xf32, "cuda"> %alloc_73 = memref.alloc() : memref<256xf32, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_29, %arg73, %48, %alloc_71, %alloc_72, %alloc_73) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_29, %arg73, %50, %alloc_71, %alloc_72, %alloc_73) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> %alloc_74 = memref.alloc() : memref<4x256x14x14xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_71, %15, %alloc_74) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> %alloc_75 = memref.alloc() : memref<256x256x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%34#0, %alloc_71, %alloc_75) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, 
memref<256x256x3x3xf16, "cuda"> - %49 = call @Unknown87(%34#1, %alloc_74) : (memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%35#0, %alloc_71, %alloc_75) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> + %51 = call @Unknown93(%35#1, %alloc_74) : (memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda">) -> memref<4x256x14x14xf16, "cuda"> %alloc_76 = memref.alloc() : memref<4x256x14x14xf16, "cuda"> %alloc_77 = memref.alloc() : memref<256xf32, "cuda"> %alloc_78 = memref.alloc() : memref<256xf32, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_27, %arg68, %49, %alloc_76, %alloc_77, %alloc_78) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_27, %arg68, %51, %alloc_76, %alloc_77, %alloc_78) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> %alloc_79 = memref.alloc() : memref<4x256x14x14xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_76, %14, %alloc_79) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> %alloc_80 = memref.alloc() : memref<256x256x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%33#0, %alloc_76, %alloc_80) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> - %50 = call @Unknown91(%48, %alloc_79, %33#1) : (memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%34#0, %alloc_76, %alloc_80) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> + %52 = call @Unknown89(%50, %alloc_79, %34#1) : (memref<4x256x14x14xf16, "cuda">, 
memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">) -> memref<4x256x14x14xf16, "cuda"> %alloc_81 = memref.alloc() : memref<4x256x14x14xf16, "cuda"> %alloc_82 = memref.alloc() : memref<256xf32, "cuda"> %alloc_83 = memref.alloc() : memref<256xf32, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_25, %arg58, %50, %alloc_81, %alloc_82, %alloc_83) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_25, %arg58, %52, %alloc_81, %alloc_82, %alloc_83) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> %alloc_84 = memref.alloc() : memref<4x256x14x14xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_81, %13, %alloc_84) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> %alloc_85 = memref.alloc() : memref<256x256x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%32#0, %alloc_81, %alloc_85) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> - %51 = call @Unknown95(%32#1, %alloc_84) : (memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%33#0, %alloc_81, %alloc_85) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> + %53 = call @Unknown93(%33#1, %alloc_84) : (memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda">) -> memref<4x256x14x14xf16, "cuda"> %alloc_86 = memref.alloc() : memref<4x256x14x14xf16, "cuda"> %alloc_87 = memref.alloc() : memref<256xf32, "cuda"> %alloc_88 = memref.alloc() : memref<256xf32, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_23, %arg53, %51, %alloc_86, %alloc_87, %alloc_88) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_23, 
%arg53, %53, %alloc_86, %alloc_87, %alloc_88) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> %alloc_89 = memref.alloc() : memref<4x128x28x28xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_86, %12, %alloc_89) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> %alloc_90 = memref.alloc() : memref<256x128x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%31#0, %alloc_86, %alloc_90) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x128x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%32#0, %alloc_86, %alloc_90) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x128x3x3xf16, "cuda"> %alloc_91 = memref.alloc() : memref<4x256x14x14xf16, "cuda"> %alloc_92 = memref.alloc() : memref<256xf32, "cuda"> %alloc_93 = memref.alloc() : memref<256xf32, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_21, %arg63, %50, %alloc_91, %alloc_92, %alloc_93) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_21, %arg63, %52, %alloc_91, %alloc_92, %alloc_93) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> %alloc_94 = memref.alloc() : memref<4x128x28x28xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_91, %11, %alloc_94) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x128x1x1xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> %alloc_95 = memref.alloc() : memref<256x128x1x1xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%31#0, %alloc_91, %alloc_95) {batch_group_count = 1 : i64, device = 
"cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x128x1x1xf16, "cuda"> - %52 = call @Unknown102(%alloc_94, %alloc_89, %31#1) : (memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%32#0, %alloc_91, %alloc_95) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x128x1x1xf16, "cuda"> + %54 = call @Unknown108(%alloc_94, %alloc_89, %32#1) : (memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">) -> memref<4x128x28x28xf16, "cuda"> %alloc_96 = memref.alloc() : memref<4x128x28x28xf16, "cuda"> %alloc_97 = memref.alloc() : memref<128xf32, "cuda"> %alloc_98 = memref.alloc() : memref<128xf32, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_19, %arg48, %52, %alloc_96, %alloc_97, %alloc_98) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_19, %arg48, %54, %alloc_96, %alloc_97, %alloc_98) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> %alloc_99 = memref.alloc() : memref<4x128x28x28xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_96, %10, %alloc_99) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> %alloc_100 = memref.alloc() : memref<128x128x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%30#0, %alloc_96, %alloc_100) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> - %53 = call @Unknown106(%30#1, %alloc_99) : (memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%31#0, %alloc_96, %alloc_100) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout 
= "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> + %55 = call @Unknown112(%31#1, %alloc_99) : (memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda">) -> memref<4x128x28x28xf16, "cuda"> %alloc_101 = memref.alloc() : memref<4x128x28x28xf16, "cuda"> %alloc_102 = memref.alloc() : memref<128xf32, "cuda"> %alloc_103 = memref.alloc() : memref<128xf32, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_17, %arg43, %53, %alloc_101, %alloc_102, %alloc_103) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_17, %arg43, %55, %alloc_101, %alloc_102, %alloc_103) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> %alloc_104 = memref.alloc() : memref<4x128x28x28xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_101, %9, %alloc_104) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> %alloc_105 = memref.alloc() : memref<128x128x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%29#0, %alloc_101, %alloc_105) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> - %54 = call @Unknown110(%52, %alloc_104, %29#1) : (memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%30#0, %alloc_101, %alloc_105) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> + %56 = call @Unknown108(%54, %alloc_104, %30#1) : (memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">) -> memref<4x128x28x28xf16, "cuda"> %alloc_106 = memref.alloc() : memref<4x128x28x28xf16, "cuda"> %alloc_107 = memref.alloc() : memref<128xf32, "cuda"> %alloc_108 = memref.alloc() : memref<128xf32, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_15, %arg33, %54, %alloc_106, 
%alloc_107, %alloc_108) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_15, %arg33, %56, %alloc_106, %alloc_107, %alloc_108) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> %alloc_109 = memref.alloc() : memref<4x128x28x28xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_106, %8, %alloc_109) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> %alloc_110 = memref.alloc() : memref<128x128x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%28#0, %alloc_106, %alloc_110) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> - %55 = call @Unknown114(%28#1, %alloc_109) : (memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%29#0, %alloc_106, %alloc_110) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> + %57 = call @Unknown112(%29#1, %alloc_109) : (memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda">) -> memref<4x128x28x28xf16, "cuda"> %alloc_111 = memref.alloc() : memref<4x128x28x28xf16, "cuda"> %alloc_112 = memref.alloc() : memref<128xf32, "cuda"> %alloc_113 = memref.alloc() : memref<128xf32, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_13, %arg28, %55, %alloc_111, %alloc_112, %alloc_113) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_13, %arg28, %57, %alloc_111, %alloc_112, %alloc_113) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, 
"cuda">, memref<128xf32, "cuda"> %alloc_114 = memref.alloc() : memref<4x64x56x56xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_111, %7, %alloc_114) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> %alloc_115 = memref.alloc() : memref<128x64x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%27#0, %alloc_111, %alloc_115) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x64x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%28#0, %alloc_111, %alloc_115) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x64x3x3xf16, "cuda"> %alloc_116 = memref.alloc() : memref<4x128x28x28xf16, "cuda"> %alloc_117 = memref.alloc() : memref<128xf32, "cuda"> %alloc_118 = memref.alloc() : memref<128xf32, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_11, %arg38, %54, %alloc_116, %alloc_117, %alloc_118) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_11, %arg38, %56, %alloc_116, %alloc_117, %alloc_118) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> %alloc_119 = memref.alloc() : memref<4x64x56x56xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_116, %6, %alloc_119) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x64x1x1xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> %alloc_120 = memref.alloc() : memref<128x64x1x1xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%27#0, %alloc_116, %alloc_120) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x64x1x1xf16, 
"cuda"> - %56 = call @Unknown121(%alloc_119, %alloc_114, %27#1) : (memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%28#0, %alloc_116, %alloc_120) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x64x1x1xf16, "cuda"> + %58 = call @Unknown127(%alloc_119, %alloc_114, %28#1) : (memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">) -> memref<4x64x56x56xf16, "cuda"> %alloc_121 = memref.alloc() : memref<4x64x56x56xf16, "cuda"> %alloc_122 = memref.alloc() : memref<64xf32, "cuda"> %alloc_123 = memref.alloc() : memref<64xf32, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_9, %arg23, %56, %alloc_121, %alloc_122, %alloc_123) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_9, %arg23, %58, %alloc_121, %alloc_122, %alloc_123) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> %alloc_124 = memref.alloc() : memref<4x64x56x56xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_121, %5, %alloc_124) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> %alloc_125 = memref.alloc() : memref<64x64x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%26#0, %alloc_121, %alloc_125) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - %57 = call @Unknown125(%26#1, %alloc_124) : (memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%27#0, %alloc_121, %alloc_125) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> + %59 = call @Unknown131(%27#1, %alloc_124) : (memref<4x64x56x56xi1, "cuda">, 
memref<4x64x56x56xf16, "cuda">) -> memref<4x64x56x56xf16, "cuda"> %alloc_126 = memref.alloc() : memref<4x64x56x56xf16, "cuda"> %alloc_127 = memref.alloc() : memref<64xf32, "cuda"> %alloc_128 = memref.alloc() : memref<64xf32, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_7, %arg18, %57, %alloc_126, %alloc_127, %alloc_128) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_7, %arg18, %59, %alloc_126, %alloc_127, %alloc_128) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> %alloc_129 = memref.alloc() : memref<4x64x56x56xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_126, %4, %alloc_129) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> %alloc_130 = memref.alloc() : memref<64x64x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%25#0, %alloc_126, %alloc_130) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - %58 = call @Unknown129(%56, %alloc_129, %25#1) : (memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%26#0, %alloc_126, %alloc_130) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> + %60 = call @Unknown127(%58, %alloc_129, %26#1) : (memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">) -> memref<4x64x56x56xf16, "cuda"> %alloc_131 = memref.alloc() : memref<4x64x56x56xf16, "cuda"> %alloc_132 = memref.alloc() : memref<64xf32, "cuda"> %alloc_133 = memref.alloc() : memref<64xf32, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_5, %arg13, %58, %alloc_131, %alloc_132, %alloc_133) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute 
@BatchNormGradOp_f16f32f16_f16f32f32(%alloc_5, %arg13, %60, %alloc_131, %alloc_132, %alloc_133) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> %alloc_134 = memref.alloc() : memref<4x64x56x56xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_131, %3, %alloc_134) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> %alloc_135 = memref.alloc() : memref<64x64x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%24#0, %alloc_131, %alloc_135) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - %59 = call @Unknown133(%24#1, %alloc_134) : (memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%25#0, %alloc_131, %alloc_135) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> + %61 = call @Unknown131(%25#1, %alloc_134) : (memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda">) -> memref<4x64x56x56xf16, "cuda"> %alloc_136 = memref.alloc() : memref<4x64x56x56xf16, "cuda"> %alloc_137 = memref.alloc() : memref<64xf32, "cuda"> %alloc_138 = memref.alloc() : memref<64xf32, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_3, %arg8, %59, %alloc_136, %alloc_137, %alloc_138) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc_3, %arg8, %61, %alloc_136, %alloc_137, %alloc_138) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> %alloc_139 = memref.alloc() : memref<4x64x56x56xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%alloc_136, %2, %alloc_139) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, 
window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> %alloc_140 = memref.alloc() : memref<64x64x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%alloc_2, %alloc_136, %alloc_140) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - %60 = call @Unknown137(%58, %alloc_139) : (memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + %62 = call @Unknown143(%60, %alloc_139) : (memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">) -> memref<4x64x56x56xf16, "cuda"> %alloc_141 = memref.alloc() : memref<4x64x112x112xf16, "cuda"> - byre.compute @PoolMaxGradOp_f16f16_f16(%23#0, %60, %alloc_141) {device = "cuda", memory_effects = [1 : i32, 1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x112x112xf16, "cuda"> - %61 = call @Unknown138(%23#1, %alloc_141) : (memref<4x64x112x112xi1, "cuda">, memref<4x64x112x112xf16, "cuda">) -> memref<4x64x112x112xf16, "cuda"> + byre.compute @PoolMaxGradOp_f16f16_f16(%24#0, %62, %alloc_141) {device = "cuda", memory_effects = [1 : i32, 1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x112x112xf16, "cuda"> + %63 = call @Unknown144(%24#1, %alloc_141) : (memref<4x64x112x112xi1, "cuda">, memref<4x64x112x112xf16, "cuda">) -> memref<4x64x112x112xf16, "cuda"> %alloc_142 = memref.alloc() : memref<4x64x112x112xf16, "cuda"> %alloc_143 = memref.alloc() : memref<64xf32, "cuda"> %alloc_144 = memref.alloc() : memref<64xf32, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc, %arg3, %61, %alloc_142, %alloc_143, %alloc_144) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%alloc, %arg3, %63, %alloc_142, %alloc_143, %alloc_144) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> %alloc_145 = memref.alloc() : memref<64x3x7x7xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%0, %alloc_142, %alloc_145) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} 
: memref<4x3x224x224xf16, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<64x3x7x7xf16, "cuda"> %alloc_146 = memref.alloc() : memref - byre.compute @ReduceSumOp_f32_f32(%43#1, %alloc_146) {device = "cuda", dimensions = dense<[0, 1]> : tensor<2xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf32, "cuda">, memref - %62 = call @Unknown141(%alloc_146) : (memref) -> memref - %63 = call @Unknown142(%alloc_145) : (memref<64x3x7x7xf16, "cuda">) -> memref<64x3x7x7xf32, "cuda"> - %64 = call @Unknown143(%alloc_140) : (memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> - %65 = call @Unknown144(%alloc_135) : (memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> - %66 = call @Unknown145(%alloc_130) : (memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> - %67 = call @Unknown146(%alloc_125) : (memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> - %68 = call @Unknown147(%alloc_115) : (memref<128x64x3x3xf16, "cuda">) -> memref<128x64x3x3xf32, "cuda"> - %69 = call @Unknown148(%alloc_110) : (memref<128x128x3x3xf16, "cuda">) -> memref<128x128x3x3xf32, "cuda"> - %70 = call @Unknown149(%alloc_120) : (memref<128x64x1x1xf16, "cuda">) -> memref<128x64x1x1xf32, "cuda"> - %71 = call @Unknown150(%alloc_105) : (memref<128x128x3x3xf16, "cuda">) -> memref<128x128x3x3xf32, "cuda"> - %72 = call @Unknown151(%alloc_100) : (memref<128x128x3x3xf16, "cuda">) -> memref<128x128x3x3xf32, "cuda"> - %73 = call @Unknown152(%alloc_90) : (memref<256x128x3x3xf16, "cuda">) -> memref<256x128x3x3xf32, "cuda"> - %74 = call @Unknown153(%alloc_85) : (memref<256x256x3x3xf16, "cuda">) -> memref<256x256x3x3xf32, "cuda"> - %75 = call @Unknown154(%alloc_95) : (memref<256x128x1x1xf16, "cuda">) -> memref<256x128x1x1xf32, "cuda"> - %76 = call @Unknown155(%alloc_80) : (memref<256x256x3x3xf16, "cuda">) -> memref<256x256x3x3xf32, "cuda"> - %77 = call @Unknown156(%alloc_75) : (memref<256x256x3x3xf16, "cuda">) -> memref<256x256x3x3xf32, "cuda"> - %78 = call @Unknown157(%alloc_65) : (memref<512x256x3x3xf16, "cuda">) -> memref<512x256x3x3xf32, "cuda"> - %79 = call @Unknown158(%alloc_60) : (memref<512x512x3x3xf16, "cuda">) -> memref<512x512x3x3xf32, "cuda"> - %80 = call @Unknown159(%alloc_70) : (memref<512x256x1x1xf16, "cuda">) -> memref<512x256x1x1xf32, "cuda"> - %81 = call @Unknown160(%alloc_55) : (memref<512x512x3x3xf16, "cuda">) -> memref<512x512x3x3xf32, "cuda"> - %82 = call @Unknown161(%alloc_50) : (memref<512x512x3x3xf16, "cuda">) -> memref<512x512x3x3xf32, "cuda"> - %alloc_147 = memref.alloc() : memref<1000x512xf16, "cuda"> - byre.compute @MatmulOp_f16f16_f16(%40, %43#0, %alloc_147) {device = "cuda", lhs_contracting_dimension = 0 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_transpose, rhs_contracting_dimension = 0 : i64} : memref<4x512xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<1000x512xf16, "cuda"> - %83 = call @Unknown163(%alloc_147) : (memref<1000x512xf16, "cuda">) -> memref<1000x512xf32, "cuda"> - %alloc_148 = memref.alloc() : memref<1000xf32, "cuda"> - byre.compute @ReduceSumOp_f32_f32(%43#2, %alloc_148) {device = "cuda", dimensions = dense<0> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf32, "cuda">, memref<1000xf32, "cuda"> - %84 = call @Unknown164(%alloc_148) : (memref<1000xf32, "cuda">) -> memref<1000xf32, "cuda"> - return %62, %63, %alloc_143, %alloc_144, %64, %alloc_137, %alloc_138, %65, %alloc_132, %alloc_133, %66, %alloc_127, %alloc_128, %67, %alloc_122, %alloc_123, %68, %alloc_112, %alloc_113, %69, %alloc_107, %alloc_108, %70, 
%alloc_117, %alloc_118, %71, %alloc_102, %alloc_103, %72, %alloc_97, %alloc_98, %73, %alloc_87, %alloc_88, %74, %alloc_82, %alloc_83, %75, %alloc_92, %alloc_93, %76, %alloc_77, %alloc_78, %77, %alloc_72, %alloc_73, %78, %alloc_62, %alloc_63, %79, %alloc_57, %alloc_58, %80, %alloc_67, %alloc_68, %81, %alloc_52, %alloc_53, %82, %alloc_47, %alloc_48, %83, %84 : memref, memref<64x3x7x7xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64x64x3x3xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64x64x3x3xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64x64x3x3xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64x64x3x3xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<128x64x3x3xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128x128x3x3xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128x64x1x1xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128x128x3x3xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128x128x3x3xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<256x128x3x3xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256x256x3x3xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256x128x1x1xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256x256x3x3xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256x256x3x3xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<512x256x3x3xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512x512x3x3xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512x256x1x1xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512x512x3x3xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512x512x3x3xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<1000x512xf32, "cuda">, memref<1000xf32, "cuda"> + %collapse_shape_147 = memref.collapse_shape %45#0 [[0, 1]] : memref<4x1000xf16, "cuda"> into memref<4000xf16, "cuda"> + %collapse_shape_148 = memref.collapse_shape %arg1 [[0, 1]] : memref<4x1000xf32, "cuda"> into memref<4000xf32, "cuda"> + %expand_shape_149 = memref.expand_shape %collapse_shape_147 [[0, 1]] : memref<4000xf16, "cuda"> into memref<32x125xf16, "cuda"> + %expand_shape_150 = memref.expand_shape %collapse_shape_148 [[0, 1]] : memref<4000xf32, "cuda"> into memref<32x125xf32, "cuda"> + %alloc_151 = memref.alloc() : memref<32xf32, "cuda"> + byre.compute @PTXOp(%expand_shape_149, %expand_shape_150, %alloc_151) {BlockSize.x = 128 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 32 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown147_kernel"} : memref<32x125xf16, "cuda">, memref<32x125xf32, "cuda">, memref<32xf32, "cuda"> + byre.compute @PTXOp(%alloc_151, %alloc_146) {BlockSize.x = 32 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 1 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown147_kernel_0"} : memref<32xf32, "cuda">, memref + %64 = call @Unknown148(%alloc_146) : (memref) -> memref + %65 = call @Unknown149(%alloc_145) : (memref<64x3x7x7xf16, "cuda">) -> memref<64x3x7x7xf32, "cuda"> + %66 = call @Unknown150(%alloc_140) : (memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> + %67 = call @Unknown150(%alloc_135) : 
(memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> + %68 = call @Unknown150(%alloc_130) : (memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> + %69 = call @Unknown150(%alloc_125) : (memref<64x64x3x3xf16, "cuda">) -> memref<64x64x3x3xf32, "cuda"> + %70 = call @Unknown154(%alloc_115) : (memref<128x64x3x3xf16, "cuda">) -> memref<128x64x3x3xf32, "cuda"> + %71 = call @Unknown155(%alloc_110) : (memref<128x128x3x3xf16, "cuda">) -> memref<128x128x3x3xf32, "cuda"> + %72 = call @Unknown156(%alloc_120) : (memref<128x64x1x1xf16, "cuda">) -> memref<128x64x1x1xf32, "cuda"> + %73 = call @Unknown155(%alloc_105) : (memref<128x128x3x3xf16, "cuda">) -> memref<128x128x3x3xf32, "cuda"> + %74 = call @Unknown155(%alloc_100) : (memref<128x128x3x3xf16, "cuda">) -> memref<128x128x3x3xf32, "cuda"> + %75 = call @Unknown159(%alloc_90) : (memref<256x128x3x3xf16, "cuda">) -> memref<256x128x3x3xf32, "cuda"> + %76 = call @Unknown160(%alloc_85) : (memref<256x256x3x3xf16, "cuda">) -> memref<256x256x3x3xf32, "cuda"> + %77 = call @Unknown161(%alloc_95) : (memref<256x128x1x1xf16, "cuda">) -> memref<256x128x1x1xf32, "cuda"> + %78 = call @Unknown160(%alloc_80) : (memref<256x256x3x3xf16, "cuda">) -> memref<256x256x3x3xf32, "cuda"> + %79 = call @Unknown160(%alloc_75) : (memref<256x256x3x3xf16, "cuda">) -> memref<256x256x3x3xf32, "cuda"> + %80 = call @Unknown164(%alloc_65) : (memref<512x256x3x3xf16, "cuda">) -> memref<512x256x3x3xf32, "cuda"> + %81 = call @Unknown165(%alloc_60) : (memref<512x512x3x3xf16, "cuda">) -> memref<512x512x3x3xf32, "cuda"> + %82 = call @Unknown166(%alloc_70) : (memref<512x256x1x1xf16, "cuda">) -> memref<512x256x1x1xf32, "cuda"> + %83 = call @Unknown165(%alloc_55) : (memref<512x512x3x3xf16, "cuda">) -> memref<512x512x3x3xf32, "cuda"> + %84 = call @Unknown165(%alloc_50) : (memref<512x512x3x3xf16, "cuda">) -> memref<512x512x3x3xf32, "cuda"> + %alloc_152 = memref.alloc() : memref<1000x512xf16, "cuda"> + byre.compute @MatmulOp_f16f16_f16(%41, %45#1, %alloc_152) {device = "cuda", lhs_contracting_dimension = 0 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_transpose, rhs_contracting_dimension = 0 : i64} : memref<4x512xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<1000x512xf16, "cuda"> + %85 = call @Unknown170(%alloc_152) : (memref<1000x512xf16, "cuda">) -> memref<1000x512xf32, "cuda"> + %alloc_153 = memref.alloc() : memref<1000xf32, "cuda"> + byre.compute @PTXOp(%45#1, %alloc_153) {BlockSize.x = 32 : i32, BlockSize.y = 2 : i32, BlockSize.z = 1 : i32, GridSize.x = 32 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown171_kernel"} : memref<4x1000xf16, "cuda">, memref<1000xf32, "cuda"> + %86 = call @Unknown172(%alloc_153) : (memref<1000xf32, "cuda">) -> memref<1000xf32, "cuda"> + return %64, %65, %alloc_143, %alloc_144, %66, %alloc_137, %alloc_138, %67, %alloc_132, %alloc_133, %68, %alloc_127, %alloc_128, %69, %alloc_122, %alloc_123, %70, %alloc_112, %alloc_113, %71, %alloc_107, %alloc_108, %72, %alloc_117, %alloc_118, %73, %alloc_102, %alloc_103, %74, %alloc_97, %alloc_98, %75, %alloc_87, %alloc_88, %76, %alloc_82, %alloc_83, %77, %alloc_92, %alloc_93, %78, %alloc_77, %alloc_78, %79, %alloc_72, %alloc_73, %80, %alloc_62, %alloc_63, %81, %alloc_57, %alloc_58, %82, %alloc_67, %alloc_68, %83, %alloc_52, %alloc_53, %84, %alloc_47, %alloc_48, %85, %86 : memref, memref<64x3x7x7xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64x64x3x3xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64x64x3x3xf32, 
"cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64x64x3x3xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<64x64x3x3xf32, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<128x64x3x3xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128x128x3x3xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128x64x1x1xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128x128x3x3xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<128x128x3x3xf32, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<256x128x3x3xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256x256x3x3xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256x128x1x1xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256x256x3x3xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<256x256x3x3xf32, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<512x256x3x3xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512x512x3x3xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512x256x1x1xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512x512x3x3xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<512x512x3x3xf32, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<1000x512xf32, "cuda">, memref<1000xf32, "cuda"> } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/Whole/9a_byre_host.mlir b/compiler/test/E2E/ResNet18/Whole/9a_byre_host.mlir index 9840706d4..e320376ae 100644 --- a/compiler/test/E2E/ResNet18/Whole/9a_byre_host.mlir +++ b/compiler/test/E2E/ResNet18/Whole/9a_byre_host.mlir @@ -4,4295 +4,2464 @@ module @IrToMhlo.2452 attributes {byre.container_module, gpu.container_module} { gpu.module @unified { - gpu.func @Unknown164(%arg0: memref<1000xf32>, %arg1: memref<1000xf32>) kernel { + gpu.func @Unknown172(%arg0: memref<1000xf32>, %arg1: memref<1000xf32>) kernel { %c1000 = arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1000 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<1000xf32> - %7 = arith.truncf %6 : f32 to f16 - %8 = arith.extf %7 : f16 to f32 - memref.store %8, %arg1[%4] : memref<1000xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1000 step %6 { + %7 = memref.load %arg0[%arg2] : memref<1000xf32> + %8 = arith.truncf %7 : f32 to f16 + %9 = arith.extf %8 : f16 to f32 + memref.store %9, %arg1[%arg2] : memref<1000xf32> } gpu.return } - gpu.func @Unknown163(%arg0: memref<1000x512xf16>, %arg1: memref<1000x512xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown170(%arg0: memref<1000x512xf16>, %arg1: memref<1000x512xf32>) kernel { %c512000 = arith.constant 512000 : index %c512 = arith.constant 512 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512000 : index - scf.if %5 { - %6 = arith.remsi %4, %c512 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c512 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = 
arith.divsi %12, %c512 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9] : memref<1000x512xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9] : memref<1000x512xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c512000 step %6 { + %7 = arith.remsi %arg2, %c512 : index + %8 = arith.divsi %arg2, %c512 : index + %9 = memref.load %arg0[%8, %7] : memref<1000x512xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7] : memref<1000x512xf32> } gpu.return } - gpu.func @Unknown161(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { + gpu.func @Unknown166(%arg0: memref<512x256x1x1xf16>, %arg1: memref<512x256x1x1xf32>) kernel { + %c131072 = arith.constant 131072 : index %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index + %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c131072 step %6 { + %7 = arith.remsi %arg2, %c256 : index + %8 = arith.divsi %arg2, %c256 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<512x256x1x1xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<512x256x1x1xf32> } gpu.return } - gpu.func @Unknown160(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown165(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c512 = arith.constant 512 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 
= arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c2359296 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x512x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x512x3x3xf32> } gpu.return } - gpu.func @Unknown159(%arg0: memref<512x256x1x1xf16>, %arg1: memref<512x256x1x1xf32>) kernel { - %c0 = arith.constant 0 : index - %c131072 = arith.constant 131072 : index + gpu.func @Unknown164(%arg0: memref<512x256x3x3xf16>, %arg1: memref<512x256x3x3xf32>) kernel { + %c1179648 = arith.constant 1179648 : index %c256 = arith.constant 256 : index - %c-1 = arith.constant -1 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c131072 : index - scf.if %5 { - %6 = arith.remsi %4, %c256 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c256 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c256 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<512x256x1x1xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<512x256x1x1xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1179648 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x256x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x256x3x3xf32> } gpu.return } - 
gpu.func @Unknown158(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { + gpu.func @Unknown161(%arg0: memref<256x128x1x1xf16>, %arg1: memref<256x128x1x1xf32>) kernel { + %c32768 = arith.constant 32768 : index %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index + %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c32768 step %6 { + %7 = arith.remsi %arg2, %c128 : index + %8 = arith.divsi %arg2, %c128 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<256x128x1x1xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<256x128x1x1xf32> } gpu.return } - gpu.func @Unknown157(%arg0: memref<512x256x3x3xf16>, %arg1: memref<512x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c1179648 = arith.constant 1179648 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown160(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { + %c589824 = arith.constant 589824 : index %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1179648 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, 
%c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x256x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c589824 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x256x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x256x3x3xf32> } gpu.return } - gpu.func @Unknown156(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index + gpu.func @Unknown159(%arg0: memref<256x128x3x3xf16>, %arg1: memref<256x128x3x3xf32>) kernel { + %c294912 = arith.constant 294912 : index + %c128 = arith.constant 128 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c294912 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = 
arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x128x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x128x3x3xf32> } gpu.return } - gpu.func @Unknown155(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { + gpu.func @Unknown156(%arg0: memref<128x64x1x1xf16>, %arg1: memref<128x64x1x1xf32>) kernel { + %c8192 = arith.constant 8192 : index %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index + %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c8192 step %6 { + %7 = arith.remsi %arg2, %c64 : index + %8 = arith.divsi %arg2, %c64 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<128x64x1x1xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<128x64x1x1xf32> } gpu.return } - gpu.func @Unknown154(%arg0: memref<256x128x1x1xf16>, %arg1: memref<256x128x1x1xf32>) kernel { - %c0 = arith.constant 0 : index - %c32768 = arith.constant 32768 : index + gpu.func @Unknown155(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { + %c147456 = arith.constant 147456 : index %c128 = arith.constant 128 : index - %c-1 = arith.constant -1 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c32768 : index - scf.if %5 { - %6 = arith.remsi %4, %c128 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c128 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c128 : index - %14 = arith.subi %c-1, %13 : index - %15 = 
arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<256x128x1x1xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<256x128x1x1xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c147456 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x128x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x128x3x3xf32> } gpu.return } - gpu.func @Unknown153(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index + gpu.func @Unknown154(%arg0: memref<128x64x3x3xf16>, %arg1: memref<128x64x3x3xf32>) kernel { + %c73728 = arith.constant 73728 : index + %c64 = arith.constant 64 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c73728 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x64x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x64x3x3xf32> } gpu.return } - gpu.func @Unknown152(%arg0: memref<256x128x3x3xf16>, %arg1: memref<256x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c294912 = arith.constant 294912 : index + gpu.func @Unknown150(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { + %c36864 = arith.constant 36864 : index + %c64 = arith.constant 64 : 
index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c294912 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x128x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c36864 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x64x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x64x3x3xf32> } gpu.return } - gpu.func @Unknown151(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index + gpu.func @Unknown149(%arg0: memref<64x3x7x7xf16>, %arg1: memref<64x3x7x7xf32>) kernel { + %c9408 = arith.constant 9408 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - 
%26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c9408 step %6 { + %7 = arith.remsi %arg2, %c7 : index + %8 = arith.divsi %arg2, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c3 : index + %12 = arith.divsi %10, %c3 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x3x7x7xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x3x7x7xf32> } gpu.return } - gpu.func @Unknown150(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + gpu.func @Unknown148(%arg0: memref, %arg1: memref) kernel { + %c1 = arith.constant 1 : index + %cst = arith.constant 4.000000e+00 : f32 %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1 step %6 { + %7 = memref.load %arg0[] : memref + %8 = arith.negf %7 : f32 + %9 = arith.divf %8, %cst : f32 + memref.store %9, %arg1[] : memref } gpu.return } - gpu.func @Unknown149(%arg0: memref<128x64x1x1xf16>, %arg1: memref<128x64x1x1xf32>) kernel { - %c0 = arith.constant 0 : index - %c8192 = arith.constant 8192 : index + gpu.func @Unknown144(%arg0: memref<4x64x112x112xi1>, %arg1: memref<4x64x112x112xf16>, %arg2: 
memref<4x64x112x112xf16>) kernel { + %c3211264 = arith.constant 3211264 : index + %cst = arith.constant 0.000000e+00 : f16 %c64 = arith.constant 64 : index - %c-1 = arith.constant -1 : index + %c112 = arith.constant 112 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c8192 : index - scf.if %5 { - %6 = arith.remsi %4, %c64 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c64 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c64 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<128x64x1x1xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<128x64x1x1xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c3211264 step %6 { + %7 = arith.remsi %arg3, %c112 : index + %8 = arith.divsi %arg3, %c112 : index + %9 = arith.remsi %8, %c112 : index + %10 = arith.divsi %8, %c112 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x112x112xi1> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x64x112x112xf16> + %15 = arith.select %13, %14, %cst : f16 + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x64x112x112xf16> } gpu.return } - gpu.func @Unknown148(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + gpu.func @Unknown143(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>) kernel { + %c802816 = arith.constant 802816 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - 
memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg3, %c56 : index + %8 = arith.divsi %arg3, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %15 = arith.addf %13, %14 : f16 + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x64x56x56xf16> } gpu.return } - gpu.func @Unknown147(%arg0: memref<128x64x3x3xf16>, %arg1: memref<128x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c73728 = arith.constant 73728 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown131(%arg0: memref<4x64x56x56xi1>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>) kernel { + %c802816 = arith.constant 802816 : index + %cst = arith.constant 0.000000e+00 : f16 %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c73728 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg3, %c56 : index + %8 = arith.divsi %arg3, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x56x56xi1> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %15 = arith.select %13, %14, %cst : f16 + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x64x56x56xf16> } gpu.return } - gpu.func @Unknown146(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown127(%arg0: 
memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>, %arg3: memref<4x64x56x56xf16>) kernel { + %c802816 = arith.constant 802816 : index + %cst = arith.constant 0.000000e+00 : f16 %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg4, %c56 : index + %8 = arith.divsi %arg4, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %15 = memref.load %arg2[%12, %11, %9, %7] : memref<4x64x56x56xi1> + %16 = arith.addf %13, %14 : f16 + %17 = arith.select %15, %16, %cst : f16 + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x64x56x56xf16> } gpu.return } - gpu.func @Unknown145(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown112(%arg0: memref<4x128x28x28xi1>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xf16>) kernel { + %c401408 = arith.constant 401408 : index + %cst = arith.constant 0.000000e+00 : f16 + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = 
arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c401408 step %6 { + %7 = arith.remsi %arg3, %c28 : index + %8 = arith.divsi %arg3, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x128x28x28xi1> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x128x28x28xf16> + %15 = arith.select %13, %14, %cst : f16 + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x128x28x28xf16> } gpu.return } - gpu.func @Unknown144(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown108(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>, %arg3: memref<4x128x28x28xf16>) kernel { + %c401408 = arith.constant 401408 : index + %cst = arith.constant 0.000000e+00 : f16 + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select 
%30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c401408 step %6 { + %7 = arith.remsi %arg4, %c28 : index + %8 = arith.divsi %arg4, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x128x28x28xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x128x28x28xf16> + %15 = memref.load %arg2[%12, %11, %9, %7] : memref<4x128x28x28xi1> + %16 = arith.addf %13, %14 : f16 + %17 = arith.select %15, %16, %cst : f16 + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x128x28x28xf16> } gpu.return } - gpu.func @Unknown143(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown93(%arg0: memref<4x256x14x14xi1>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xf16>) kernel { + %c200704 = arith.constant 200704 : index + %cst = arith.constant 0.000000e+00 : f16 + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg3, %c14 : index + %8 = arith.divsi %arg3, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x256x14x14xi1> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x256x14x14xf16> + %15 = arith.select %13, %14, %cst : f16 + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x256x14x14xf16> 
} gpu.return } - gpu.func @Unknown142(%arg0: memref<64x3x7x7xf16>, %arg1: memref<64x3x7x7xf32>) kernel { - %c0 = arith.constant 0 : index - %c9408 = arith.constant 9408 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c3 = arith.constant 3 : index + gpu.func @Unknown89(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>, %arg3: memref<4x256x14x14xf16>) kernel { + %c200704 = arith.constant 200704 : index + %cst = arith.constant 0.000000e+00 : f16 + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c9408 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c3 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c3 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c3 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x3x7x7xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x3x7x7xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg4, %c14 : index + %8 = arith.divsi %arg4, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x256x14x14xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x256x14x14xf16> + %15 = memref.load %arg2[%12, %11, %9, %7] : memref<4x256x14x14xi1> + %16 = arith.addf %13, %14 : f16 + %17 = arith.select %15, %16, %cst : f16 + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x256x14x14xf16> } gpu.return } - gpu.func @Unknown141(%arg0: memref, %arg1: memref) kernel { - %cst = arith.constant 4.000000e+00 : f32 - %c1 = arith.constant 1 : index + gpu.func @Unknown78(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>, %arg3: memref<4x512x7x7xf16>) kernel { + %c100352 = arith.constant 100352 : index + %cst = arith.constant 0.000000e+00 : f16 + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1 : index - scf.if %5 { - %6 = memref.load %arg0[] : memref - %7 = arith.negf %6 : f32 - %8 = arith.divf %7, %cst : f32 - memref.store %8, 
%arg1[] : memref + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg4, %c7 : index + %8 = arith.divsi %arg4, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x512x7x7xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x512x7x7xf16> + %15 = memref.load %arg2[%12, %11, %9, %7] : memref<4x512x7x7xi1> + %16 = arith.addf %13, %14 : f16 + %17 = arith.select %15, %16, %cst : f16 + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x512x7x7xf16> } gpu.return } - gpu.func @Unknown138(%arg0: memref<4x64x112x112xi1>, %arg1: memref<4x64x112x112xf16>, %arg2: memref<4x64x112x112xf16>) kernel { + gpu.func @Unknown74(%arg0: memref<4x512x7x7xi1>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xf16>) kernel { + %c100352 = arith.constant 100352 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c3211264 = arith.constant 3211264 : index - %c112 = arith.constant 112 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c3211264 : index - scf.if %5 { - %6 = arith.remsi %4, %c112 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c112 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c112 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c112 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c112 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c112 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x112x112xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x112x112xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x112x112xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg3, %c7 : index + %8 = arith.divsi %arg3, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x512x7x7xi1> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x512x7x7xf16> + %15 = arith.select %13, %14, %cst : f16 + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x512x7x7xf16> } gpu.return } - gpu.func @Unknown137(%arg0: memref<4x64x56x56xf16>, 
%arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>) kernel { - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown70(%arg0: memref<4x512xf16>, %arg1: memref<4x512x7x7xi1>, %arg2: memref<4x512x7x7xf16>) kernel { + %c100352 = arith.constant 100352 : index + %cst = arith.constant 4.900000e+01 : f16 + %cst_0 = arith.constant 0.000000e+00 : f16 + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = arith.addf %36, %37 : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg3, %c7 : index + %8 = arith.divsi %arg3, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11] : memref<4x512xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x512x7x7xi1> + %15 = arith.divf %13, %cst : f16 + %16 = arith.select %14, %15, %cst_0 : f16 + memref.store %16, %arg2[%12, %11, %9, %7] : memref<4x512x7x7xf16> } gpu.return } - gpu.func @Unknown133(%arg0: memref<4x64x56x56xi1>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown69(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4xf16>, %arg3: memref<4x1000xf16>, %arg4: memref<4x1000xf16>, %arg5: memref<4x1000xf16>) kernel { + %c4000 = arith.constant 4000 : index + %c1000 = arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : 
index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg6 = %4 to %c4000 step %6 { + %7 = arith.remsi %arg6, %c1000 : index + %8 = arith.divsi %arg6, %c1000 : index + %9 = memref.load %arg2[%8] : memref<4xf16> + %10 = memref.load %arg0[%8] : memref<4xf16> + %11 = memref.load %arg1[%8, %7] : memref<4x1000xf16> + %12 = memref.load %arg3[%8, %7] : memref<4x1000xf16> + %13 = arith.subf %11, %10 : f16 + %14 = math.exp %13 : f16 + %15 = arith.mulf %14, %9 : f16 + %16 = arith.subf %12, %15 : f16 + memref.store %13, %arg4[%8, %7] : memref<4x1000xf16> + memref.store %16, %arg5[%8, %7] : memref<4x1000xf16> } gpu.return } - gpu.func @Unknown129(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>, %arg3: memref<4x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown68(%arg0: memref<4xf16>, %arg1: memref<4xf16>) kernel { + %c4 = arith.constant 4 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = 
arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x64x56x56xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c4 step %6 { + %7 = memref.load %arg0[%arg2] : memref<4xf16> + %8 = math.log %7 : f16 + memref.store %8, %arg1[%arg2] : memref<4xf16> } gpu.return } - gpu.func @Unknown125(%arg0: memref<4x64x56x56xi1>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown66(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4x1000xf16>) kernel { + %c4000 = arith.constant 4000 : index + %c1000 = arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c4000 step %6 { + %7 = arith.remsi %arg3, %c1000 : index + %8 = arith.divsi %arg3, %c1000 : index + %9 = memref.load %arg0[%8] : memref<4xf16> + %10 = memref.load %arg1[%8, %7] : memref<4x1000xf16> + %11 = arith.subf %10, %9 : f16 + memref.store %11, %arg2[%8, %7] : memref<4x1000xf16> } gpu.return } - gpu.func @Unknown121(%arg0: 
memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>, %arg3: memref<4x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown64(%arg0: memref<1000xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4x1000xf16>) kernel { + %c4000 = arith.constant 4000 : index + %c1000 = arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x64x56x56xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c4000 step %6 { + %7 = arith.remsi %arg3, %c1000 : index + %8 = arith.divsi %arg3, %c1000 : index + %9 = memref.load %arg0[%7] : memref<1000xf16> + %10 = memref.load %arg1[%8, %7] : memref<4x1000xf16> + %11 = arith.addf %10, %9 : f16 + memref.store %11, %arg2[%8, %7] : memref<4x1000xf16> } gpu.return } - gpu.func @Unknown114(%arg0: memref<4x128x28x28xi1>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + gpu.func @Unknown63(%arg0: memref<4x512xf16>, %arg1: memref<4x512xf16>) kernel { + %c2048 = arith.constant 2048 : index + %cst = arith.constant 2.040100e-02 : f16 + %c512 = arith.constant 512 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, 
%c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c2048 step %6 { + %7 = arith.remsi %arg2, %c512 : index + %8 = arith.divsi %arg2, %c512 : index + %9 = memref.load %arg0[%8, %7] : memref<4x512xf16> + %10 = arith.mulf %9, %cst : f16 + memref.store %10, %arg1[%8, %7] : memref<4x512xf16> } gpu.return } - gpu.func @Unknown110(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>, %arg3: memref<4x128x28x28xf16>) kernel { + gpu.func @Unknown57(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xf16>, %arg3: memref<4x512x7x7xi1>) kernel { + %c100352 = arith.constant 100352 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 
: index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x128x28x28xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg4, %c7 : index + %8 = arith.divsi %arg4, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x512x7x7xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x512x7x7xf16> + %15 = arith.addf %13, %14 : f16 + %16 = arith.maximumf %15, %cst : f16 + %17 = arith.cmpf ogt, %16, %cst : f16 + memref.store %16, %arg2[%12, %11, %9, %7] : memref<4x512x7x7xf16> + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x512x7x7xi1> } gpu.return } - gpu.func @Unknown106(%arg0: memref<4x128x28x28xi1>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xf16>) kernel { + gpu.func @Unknown55(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>) kernel { + %c100352 = arith.constant 100352 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg3, %c7 : index + %8 = arith.divsi %arg3, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = 
arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x512x7x7xf16> + %14 = arith.maximumf %13, %cst : f16 + %15 = arith.cmpf ogt, %14, %cst : f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<4x512x7x7xf16> + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x512x7x7xi1> } gpu.return } - gpu.func @Unknown102(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>, %arg3: memref<4x128x28x28xf16>) kernel { + gpu.func @Unknown48(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xf16>, %arg3: memref<4x256x14x14xi1>) kernel { + %c200704 = arith.constant 200704 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x128x28x28xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg4, %c14 : index + %8 = arith.divsi %arg4, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x256x14x14xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x256x14x14xf16> + %15 = arith.addf %13, %14 : f16 + %16 = arith.maximumf %15, %cst : f16 + %17 = arith.cmpf ogt, %16, %cst : f16 + memref.store %16, %arg2[%12, %11, %9, %7] : memref<4x256x14x14xf16> + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x256x14x14xi1> } gpu.return } - gpu.func @Unknown95(%arg0: 
memref<4x256x14x14xi1>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index + gpu.func @Unknown46(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>) kernel { %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index + %cst = arith.constant 0.000000e+00 : f16 %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg3, %c14 : index + %8 = arith.divsi %arg3, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x256x14x14xf16> + %14 = arith.maximumf %13, %cst : f16 + %15 = arith.cmpf ogt, %14, %cst : f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<4x256x14x14xf16> + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x256x14x14xi1> } gpu.return } - gpu.func @Unknown91(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>, %arg3: memref<4x256x14x14xf16>) kernel { + gpu.func @Unknown39(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xf16>, %arg3: memref<4x128x28x28xi1>) kernel { + %c401408 = arith.constant 401408 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - 
%5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x256x14x14xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c401408 step %6 { + %7 = arith.remsi %arg4, %c28 : index + %8 = arith.divsi %arg4, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x128x28x28xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x128x28x28xf16> + %15 = arith.addf %13, %14 : f16 + %16 = arith.maximumf %15, %cst : f16 + %17 = arith.cmpf ogt, %16, %cst : f16 + memref.store %16, %arg2[%12, %11, %9, %7] : memref<4x128x28x28xf16> + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x128x28x28xi1> } gpu.return } - gpu.func @Unknown87(%arg0: memref<4x256x14x14xi1>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xf16>) kernel { + gpu.func @Unknown37(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>) kernel { + %c401408 = arith.constant 401408 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, 
%16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c401408 step %6 { + %7 = arith.remsi %arg3, %c28 : index + %8 = arith.divsi %arg3, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x128x28x28xf16> + %14 = arith.maximumf %13, %cst : f16 + %15 = arith.cmpf ogt, %14, %cst : f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<4x128x28x28xf16> + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x128x28x28xi1> } gpu.return } - gpu.func @Unknown83(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>, %arg3: memref<4x256x14x14xf16>) kernel { + gpu.func @Unknown30(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>, %arg3: memref<4x64x56x56xi1>) kernel { + %c802816 = arith.constant 802816 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : 
index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x256x14x14xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg4, %c56 : index + %8 = arith.divsi %arg4, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %15 = arith.addf %13, %14 : f16 + %16 = arith.maximumf %15, %cst : f16 + %17 = arith.cmpf ogt, %16, %cst : f16 + memref.store %16, %arg2[%12, %11, %9, %7] : memref<4x64x56x56xf16> + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x64x56x56xi1> } gpu.return } - gpu.func @Unknown76(%arg0: memref<4x512x7x7xi1>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xf16>) kernel { + gpu.func @Unknown28(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>) kernel { + %c802816 = arith.constant 802816 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xf16> - } - gpu.return - } - gpu.func @Unknown72(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>, %arg3: memref<4x512x7x7xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - 
%c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x512x7x7xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x512x7x7xf16> - } - gpu.return - } - gpu.func @Unknown68(%arg0: memref<4x512x7x7xi1>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : 
index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xf16> - } - gpu.return - } - gpu.func @Unknown64(%arg0: memref<4x512xf16>, %arg1: memref<4x512x7x7xi1>, %arg2: memref<4x512x7x7xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %cst_0 = arith.constant 4.900000e+01 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg1[%35, %29, %19, %9] : memref<4x512x7x7xi1> - %37 = memref.load %arg0[%35, %29] : memref<4x512xf16> - %38 = arith.divf %37, %cst_0 : f16 - %39 = arith.select %36, %38, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xf16> - } - gpu.return - } - gpu.func @Unknown63(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4xf16>, %arg3: memref<4x1000xf16>, %arg4: memref<4x1000xf32>, %arg5: memref<4x1000xf16>, %arg6: memref<4x1000xf32>, %arg7: memref<4x1000xf32>) kernel { - %c0 = arith.constant 0 : index - %c4000 = arith.constant 4000 : index - %c1000 = arith.constant 1000 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c4000 : index - scf.if %5 { - %6 = arith.remsi %4, %c1000 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c1000 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c1000 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg3[%15, %9] : memref<4x1000xf16> - %17 = memref.load %arg1[%15, %9] : memref<4x1000xf16> - %18 = memref.load %arg0[%15] : memref<4xf16> - %19 = memref.load %arg2[%15] : memref<4xf16> - %20 = 
memref.load %arg4[%15, %9] : memref<4x1000xf32> - %21 = math.log %18 : f16 - %22 = arith.subf %17, %21 : f16 - %23 = math.exp %22 : f16 - %24 = arith.mulf %23, %19 : f16 - %25 = arith.subf %16, %24 : f16 - %26 = arith.extf %22 : f16 to f32 - %27 = arith.mulf %26, %20 : f32 - %28 = arith.extf %25 : f16 to f32 - memref.store %25, %arg5[%15, %9] : memref<4x1000xf16> - memref.store %27, %arg6[%15, %9] : memref<4x1000xf32> - memref.store %28, %arg7[%15, %9] : memref<4x1000xf32> - } - gpu.return - } - gpu.func @Unknown62(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4x1000xf16>, %arg3: memref<4x1000xf16>) kernel { - %c0 = arith.constant 0 : index - %c4000 = arith.constant 4000 : index - %c1000 = arith.constant 1000 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c4000 : index - scf.if %5 { - %6 = arith.remsi %4, %c1000 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c1000 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c1000 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg1[%15, %9] : memref<4x1000xf16> - %17 = memref.load %arg0[%15] : memref<4xf16> - %18 = arith.subf %16, %17 : f16 - %19 = math.exp %18 : f16 - memref.store %18, %arg2[%15, %9] : memref<4x1000xf16> - memref.store %19, %arg3[%15, %9] : memref<4x1000xf16> - } - gpu.return - } - gpu.func @Unknown61(%arg0: memref<1000xf32>, %arg1: memref<4x1000xf16>, %arg2: memref<4x1000xf16>) kernel { - %c0 = arith.constant 0 : index - %c4000 = arith.constant 4000 : index - %c1000 = arith.constant 1000 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c4000 : index - scf.if %5 { - %6 = arith.remsi %4, %c1000 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c1000 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c1000 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg1[%15, %9] : memref<4x1000xf16> - %17 = memref.load %arg0[%9] : memref<1000xf32> - %18 = arith.truncf %17 : f32 to f16 - %19 = arith.addf %16, %18 : f16 - memref.store %19, %arg2[%15, %9] : memref<4x1000xf16> - } - gpu.return - } - gpu.func @Unknown60(%arg0: memref<4x512xf16>, %arg1: memref<4x512xf16>) kernel { - %cst = arith.constant 2.040100e-02 : f16 - %c0 = arith.constant 0 : index - %c2048 = arith.constant 2048 : index - %c512 = arith.constant 512 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2048 : index - scf.if %5 { - %6 = arith.remsi %4, %c512 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c512 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c512 : index - %14 = arith.subi %c-1, %13 : index - 
%15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9] : memref<4x512xf16> - %17 = arith.mulf %16, %cst : f16 - memref.store %17, %arg1[%15, %9] : memref<4x512xf16> - } - gpu.return - } - gpu.func @Unknown59(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xf16>, %arg3: memref<4x512x7x7xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x512x7x7xi1> - } - gpu.return - } - gpu.func @Unknown57(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index 
- %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xi1> - } - gpu.return - } - gpu.func @Unknown55(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xf16>, %arg3: memref<4x512x7x7xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x512x7x7xi1> - } - gpu.return - } - gpu.func @Unknown53(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = 
arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xi1> - } - gpu.return - } - gpu.func @Unknown50(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xf16>, %arg3: memref<4x256x14x14xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x256x14x14xf16> - memref.store %40, 
%arg3[%35, %29, %19, %9] : memref<4x256x14x14xi1> - } - gpu.return - } - gpu.func @Unknown48(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x256x14x14xi1> - } - gpu.return - } - gpu.func @Unknown46(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xf16>, %arg3: memref<4x256x14x14xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 
= arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x256x14x14xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x256x14x14xi1> - } - gpu.return - } - gpu.func @Unknown44(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x256x14x14xi1> - } - gpu.return - } - gpu.func @Unknown41(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xf16>, %arg3: memref<4x128x28x28xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi 
%c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x128x28x28xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x128x28x28xi1> - } - gpu.return - } - gpu.func @Unknown39(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x128x28x28xi1> - } - gpu.return - } - gpu.func @Unknown37(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xf16>, 
%arg3: memref<4x128x28x28xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x128x28x28xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x128x28x28xi1> - } - gpu.return - } - gpu.func @Unknown35(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = 
arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x128x28x28xi1> - } - gpu.return - } - gpu.func @Unknown32(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>, %arg3: memref<4x64x56x56xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x64x56x56xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x64x56x56xi1> - } - gpu.return - } - gpu.func @Unknown30(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 
= arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x56x56xi1> - } - gpu.return - } - gpu.func @Unknown28(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>, %arg3: memref<4x64x56x56xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x64x56x56xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x64x56x56xi1> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg3, %c56 : index + %8 = arith.divsi %arg3, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : 
index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %14 = arith.maximumf %13, %cst : f16 + %15 = arith.cmpf ogt, %14, %cst : f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<4x64x56x56xf16> + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x64x56x56xi1> } gpu.return } - gpu.func @Unknown26(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>) kernel { + gpu.func @Unknown26(%arg0: memref<4x64x112x112xf16>, %arg1: memref<4x64x112x112xf16>, %arg2: memref<4x64x112x112xi1>) kernel { + %c3211264 = arith.constant 3211264 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index + %c112 = arith.constant 112 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x56x56xi1> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c3211264 step %6 { + %7 = arith.remsi %arg3, %c112 : index + %8 = arith.divsi %arg3, %c112 : index + %9 = arith.remsi %8, %c112 : index + %10 = arith.divsi %8, %c112 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x112x112xf16> + %14 = arith.maximumf %13, %cst : f16 + %15 = arith.cmpf ogt, %14, %cst : f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<4x64x112x112xf16> + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x64x112x112xi1> } gpu.return } - gpu.func @Unknown24(%arg0: memref<4x64x112x112xf16>, %arg1: memref<4x64x112x112xf16>, %arg2: memref<4x64x112x112xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c3211264 = arith.constant 3211264 : index - %c112 = arith.constant 112 : index - %c-1 = arith.constant -1 : index - %c64 = 
arith.constant 64 : index + gpu.func @Unknown24(%arg0: memref<1000xf32>, %arg1: memref<1000xf16>) kernel { + %c1000 = arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c3211264 : index - scf.if %5 { - %6 = arith.remsi %4, %c112 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c112 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c112 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c112 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c112 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c112 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x112x112xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x64x112x112xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x112x112xi1> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1000 step %6 { + %7 = memref.load %arg0[%arg2] : memref<1000xf32> + %8 = arith.truncf %7 : f32 to f16 + memref.store %8, %arg1[%arg2] : memref<1000xf16> } gpu.return } gpu.func @Unknown23(%arg0: memref<1000x512xf32>, %arg1: memref<1000x512xf16>) kernel { - %c0 = arith.constant 0 : index %c512000 = arith.constant 512000 : index %c512 = arith.constant 512 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512000 : index - scf.if %5 { - %6 = arith.remsi %4, %c512 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c512 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c512 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9] : memref<1000x512xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9] : memref<1000x512xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c512000 step %6 { + %7 = arith.remsi %arg2, %c512 : index + %8 = arith.divsi %arg2, %c512 : index + %9 = memref.load %arg0[%8, %7] : memref<1000x512xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7] : memref<1000x512xf16> } gpu.return } gpu.func @Unknown22(%arg0: memref<4x1000xf32>, %arg1: memref<4x1000xf16>) kernel { - %cst = arith.constant -2.500000e-01 : f32 - %c0 = arith.constant 0 : index %c4000 = arith.constant 4000 : index + 
%cst = arith.constant -2.500000e-01 : f32 %c1000 = arith.constant 1000 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c4000 : index - scf.if %5 { - %6 = arith.remsi %4, %c1000 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c1000 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c1000 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9] : memref<4x1000xf32> - %17 = arith.mulf %16, %cst : f32 - %18 = arith.truncf %17 : f32 to f16 - memref.store %18, %arg1[%15, %9] : memref<4x1000xf16> - } - gpu.return - } - gpu.func @Unknown21(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf16> - } - gpu.return - } - gpu.func @Unknown20(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - 
%15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c4000 step %6 { + %7 = arith.remsi %arg2, %c1000 : index + %8 = arith.divsi %arg2, %c1000 : index + %9 = memref.load %arg0[%8, %7] : memref<4x1000xf32> + %10 = arith.mulf %9, %cst : f32 + %11 = arith.truncf %10 : f32 to f16 + memref.store %11, %arg1[%8, %7] : memref<4x1000xf16> } gpu.return } gpu.func @Unknown19(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c512 = arith.constant 512 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c2359296 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : 
index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x512x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x512x3x3xf16> } gpu.return } gpu.func @Unknown18(%arg0: memref<512x256x3x3xf32>, %arg1: memref<512x256x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c1179648 = arith.constant 1179648 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1179648 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x256x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1179648 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x256x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x256x3x3xf16> } gpu.return } gpu.func @Unknown17(%arg0: memref<512x256x1x1xf32>, %arg1: memref<512x256x1x1xf16>) kernel { - %c0 = arith.constant 0 : index %c131072 = arith.constant 131072 : index - %c256 = arith.constant 256 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c131072 : index - scf.if %5 { - %6 = arith.remsi %4, %c256 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c256 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c256 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<512x256x1x1xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9, %c0, %c0] : 
memref<512x256x1x1xf16> - } - gpu.return - } - gpu.func @Unknown16(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf16> - } - gpu.return - } - gpu.func @Unknown15(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, 
%29, %19, %9] : memref<256x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c131072 step %6 { + %7 = arith.remsi %arg2, %c256 : index + %8 = arith.divsi %arg2, %c256 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<512x256x1x1xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<512x256x1x1xf16> } gpu.return } gpu.func @Unknown14(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c589824 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x256x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x256x3x3xf16> } gpu.return } gpu.func @Unknown13(%arg0: memref<256x128x3x3xf32>, %arg1: memref<256x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c294912 = arith.constant 294912 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c294912 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 
= arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x128x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c294912 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x128x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x128x3x3xf16> } gpu.return } gpu.func @Unknown12(%arg0: memref<256x128x1x1xf32>, %arg1: memref<256x128x1x1xf16>) kernel { - %c0 = arith.constant 0 : index %c32768 = arith.constant 32768 : index - %c128 = arith.constant 128 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c32768 : index - scf.if %5 { - %6 = arith.remsi %4, %c128 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c128 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c128 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<256x128x1x1xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<256x128x1x1xf16> - } - gpu.return - } - gpu.func @Unknown11(%arg0: memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 
= arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf16> - } - gpu.return - } - gpu.func @Unknown10(%arg0: memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c32768 step %6 { + %7 = arith.remsi %arg2, %c128 : index + %8 = arith.divsi %arg2, %c128 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<256x128x1x1xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<256x128x1x1xf16> } gpu.return } gpu.func @Unknown9(%arg0: memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, 
%0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c147456 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x128x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x128x3x3xf16> } gpu.return } gpu.func @Unknown8(%arg0: memref<128x64x3x3xf32>, %arg1: memref<128x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c73728 = arith.constant 73728 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c73728 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = 
arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x64x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c73728 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x64x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x64x3x3xf16> } gpu.return } gpu.func @Unknown7(%arg0: memref<128x64x1x1xf32>, %arg1: memref<128x64x1x1xf16>) kernel { - %c0 = arith.constant 0 : index %c8192 = arith.constant 8192 : index + %c0 = arith.constant 0 : index %c64 = arith.constant 64 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c8192 : index - scf.if %5 { - %6 = arith.remsi %4, %c64 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c64 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c64 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<128x64x1x1xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<128x64x1x1xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c8192 step %6 { + %7 = arith.remsi %arg2, %c64 : index + %8 = arith.divsi %arg2, %c64 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<128x64x1x1xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<128x64x1x1xf16> } gpu.return } - gpu.func @Unknown6(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown3(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - 
%28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c36864 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x64x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x64x3x3xf16> } gpu.return } - gpu.func @Unknown5(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index + gpu.func @Unknown1(%arg0: memref<64x3x7x7xf32>, %arg1: memref<64x3x7x7xf16>) kernel { + %c9408 = arith.constant 9408 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c9408 step %6 { + %7 = arith.remsi %arg2, %c7 : index + %8 = arith.divsi %arg2, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c3 : index + %12 = arith.divsi %10, %c3 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x3x7x7xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x3x7x7xf16> } gpu.return } - gpu.func @Unknown4(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - 
%c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index + gpu.func @Unknown0(%arg0: memref<4x3x224x224xf32>, %arg1: memref<4x3x224x224xf16>) kernel { + %c602112 = arith.constant 602112 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + %c224 = arith.constant 224 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c602112 step %6 { + %7 = arith.remsi %arg2, %c224 : index + %8 = arith.divsi %arg2, %c224 : index + %9 = arith.remsi %8, %c224 : index + %10 = arith.divsi %8, %c224 : index + %11 = arith.remsi %10, %c3 : index + %12 = arith.divsi %10, %c3 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x3x224x224xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<4x3x224x224xf16> } gpu.return } - gpu.func @Unknown3(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index + gpu.func @Unknown25_kernel(%arg0: memref<4x1000xf16>, %arg1: memref<4xf16>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 0.000000e+00 : f16 + %c1000 = arith.constant 1000 : index + %c-1024 = arith.constant -1024 : index + %c512 = arith.constant 512 : index %c-1 = arith.constant -1 : index + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %0 = gpu.block_id x + %subview = memref.subview %arg0[%0, 0] [1, 1000] [1, 1] : memref<4x1000xf16> to memref<1000xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<1000xf16, strided<[1], offset: ?>> into 
memref<1x1000xf16, strided<[1000, 1], offset: ?>> + %alloca = memref.alloca() : memref<512xf16, #gpu.address_space> + %1 = gpu.thread_id x + %2 = arith.muli %1, %c2 : index + %3 = arith.cmpi slt, %1, %c0 : index + %4 = arith.subi %c-1, %1 : index + %5 = arith.select %3, %4, %1 : index + %6 = arith.divsi %5, %c512 : index + %7 = arith.subi %c-1, %6 : index + %8 = arith.select %3, %7, %6 : index + %9 = arith.muli %8, %c-1024 : index + %10 = arith.addi %2, %9 : index + %11 = arith.cmpi slt, %10, %c1000 : index + %12 = arith.select %11, %10, %c1000 : index + %13 = arith.addi %10, %c2 : index + %14 = arith.cmpi slt, %13, %c1000 : index + %15 = arith.select %14, %13, %c1000 : index + %16 = arith.subi %15, %12 : index + %subview_0 = memref.subview %expand_shape[0, %12] [1, %16] [1, 1] : memref<1x1000xf16, strided<[1000, 1], offset: ?>> to memref> + %expand_shape_1 = memref.expand_shape %subview_0 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %17 = arith.cmpi ugt, %16, %c0 : index + %18 = scf.if %17 -> (f16) { + %32 = memref.load %expand_shape_1[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %32 : f16 + } else { + scf.yield %cst : f16 + } + %19 = arith.addf %18, %cst : f16 + %20 = arith.cmpi ugt, %16, %c1 : index + %21 = scf.if %20 -> (f16) { + %32 = memref.load %expand_shape_1[%c0, %c1] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %32 : f16 + } else { + scf.yield %cst : f16 + } + %22 = arith.addf %19, %21 : f16 + memref.store %22, %alloca[%1] : memref<512xf16, #gpu.address_space> + gpu.barrier + %alloca_2 = memref.alloca() : memref<256xf16, #gpu.address_space> + %23 = arith.cmpi ult, %1, %c256 : index + scf.if %23 { + %32 = memref.load %alloca[%2] : memref<512xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca[%34] : memref<512xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %alloca_2[%1] : memref<256xf16, #gpu.address_space> + } + gpu.barrier + %alloca_3 = memref.alloca() : memref<128xf16, #gpu.address_space> + %24 = arith.cmpi ult, %1, %c128 : index + scf.if %24 { + %32 = memref.load %alloca_2[%2] : memref<256xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca_2[%34] : memref<256xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %alloca_3[%1] : memref<128xf16, #gpu.address_space> + } + gpu.barrier + %alloca_4 = memref.alloca() : memref<64xf16, #gpu.address_space> + %25 = arith.cmpi ult, %1, %c64 : index + scf.if %25 { + %32 = memref.load %alloca_3[%2] : memref<128xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca_3[%34] : memref<128xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %alloca_4[%1] : memref<64xf16, #gpu.address_space> + } + gpu.barrier + %alloca_5 = memref.alloca() : memref<32xf16, #gpu.address_space> + %26 = arith.cmpi ult, %1, %c32 : index + scf.if %26 { + %32 = memref.load %alloca_4[%2] : memref<64xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca_4[%34] : memref<64xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %alloca_5[%1] : memref<32xf16, #gpu.address_space> + } + gpu.barrier + %alloca_6 = memref.alloca() : memref<16xf16, #gpu.address_space> + %27 = arith.cmpi ult, %1, %c16 : index + scf.if %27 { + %32 = 
memref.load %alloca_5[%2] : memref<32xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca_5[%34] : memref<32xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %alloca_6[%1] : memref<16xf16, #gpu.address_space> + } + gpu.barrier + %alloca_7 = memref.alloca() : memref<8xf16, #gpu.address_space> + %28 = arith.cmpi ult, %1, %c8 : index + scf.if %28 { + %32 = memref.load %alloca_6[%2] : memref<16xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca_6[%34] : memref<16xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %alloca_7[%1] : memref<8xf16, #gpu.address_space> + } + gpu.barrier + %alloca_8 = memref.alloca() : memref<4xf16, #gpu.address_space> + %29 = arith.cmpi ult, %1, %c4 : index + scf.if %29 { + %32 = memref.load %alloca_7[%2] : memref<8xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca_7[%34] : memref<8xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %alloca_8[%1] : memref<4xf16, #gpu.address_space> + } + gpu.barrier + %alloca_9 = memref.alloca() : memref<2xf16, #gpu.address_space> + %30 = arith.cmpi ult, %1, %c2 : index + scf.if %30 { + %32 = memref.load %alloca_8[%2] : memref<4xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca_8[%34] : memref<4xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %alloca_9[%1] : memref<2xf16, #gpu.address_space> + } + gpu.barrier + %31 = arith.cmpi ult, %1, %c1 : index + scf.if %31 { + %32 = memref.load %alloca_9[%2] : memref<2xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca_9[%34] : memref<2xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %arg1[%0] : memref<4xf16> + } + gpu.barrier + gpu.return + } + gpu.func @Unknown62_kernel(%arg0: memref<2048x49xf16>, %arg1: memref<2048xf16>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c2 = arith.constant 2 : index + %c32 = arith.constant 32 : index + %cst = arith.constant 0.000000e+00 : f16 + %c1 = arith.constant 1 : index + %c49 = arith.constant 49 : index + %c0 = arith.constant 0 : index %c64 = arith.constant 64 : index %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = 
arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> - } - gpu.return - } - gpu.func @Unknown1(%arg0: memref<64x3x7x7xf32>, %arg1: memref<64x3x7x7xf16>) kernel { - %c0 = arith.constant 0 : index - %c9408 = arith.constant 9408 : index - %c7 = arith.constant 7 : index + %subview = memref.subview %arg0[%0, 0] [1, 49] [1, 1] : memref<2048x49xf16> to memref<49xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<49xf16, strided<[1], offset: ?>> into memref<1x49xf16, strided<[49, 1], offset: ?>> + %alloca = memref.alloca() : memref<64xf16, #gpu.address_space> + %1 = gpu.thread_id x + %2 = arith.remsi %1, %c64 : index + %3 = arith.cmpi slt, %2, %c0 : index + %4 = arith.addi %2, %c64 : index + %5 = arith.select %3, %4, %2 : index + %6 = arith.cmpi slt, %5, %c49 : index + %7 = arith.select %6, %5, %c49 : index + %8 = arith.addi %5, %c1 : index + %9 = arith.cmpi slt, %8, %c49 : index + %10 = arith.select %9, %8, %c49 : index + %11 = arith.subi %10, %7 : index + %subview_0 = memref.subview %expand_shape[0, %7] [1, %11] [1, 1] : memref<1x49xf16, strided<[49, 1], offset: ?>> to memref> + %expand_shape_1 = memref.expand_shape %subview_0 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %12 = arith.cmpi ugt, %11, %c0 : index + %13 = scf.if %12 -> (f16) { + %21 = memref.load %expand_shape_1[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %21 : f16 + } else { + scf.yield %cst : f16 + } + %14 = arith.addf %13, %cst : f16 + memref.store %14, %alloca[%1] : memref<64xf16, #gpu.address_space> + gpu.barrier + %alloca_2 = memref.alloca() : memref<32xf16, #gpu.address_space> + %15 = arith.cmpi ult, %1, %c32 : index + scf.if %15 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca[%21] : memref<64xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca[%24] : memref<64xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_2[%1] : memref<32xf16, #gpu.address_space> + } + gpu.barrier + %alloca_3 = memref.alloca() : memref<16xf16, #gpu.address_space> + %16 = arith.cmpi ult, %1, %c16 : index + scf.if %16 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_2[%21] : memref<32xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_2[%24] : memref<32xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_3[%1] : memref<16xf16, #gpu.address_space> + } + gpu.barrier + %alloca_4 = memref.alloca() : memref<8xf16, #gpu.address_space> + %17 = arith.cmpi ult, %1, %c8 : index + scf.if %17 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_3[%21] : memref<16xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_3[%24] : memref<16xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_4[%1] : memref<8xf16, 
#gpu.address_space> + } + gpu.barrier + %alloca_5 = memref.alloca() : memref<4xf16, #gpu.address_space> + %18 = arith.cmpi ult, %1, %c4 : index + scf.if %18 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_4[%21] : memref<8xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_4[%24] : memref<8xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_5[%1] : memref<4xf16, #gpu.address_space> + } + gpu.barrier + %alloca_6 = memref.alloca() : memref<2xf16, #gpu.address_space> + %19 = arith.cmpi ult, %1, %c2 : index + scf.if %19 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_5[%21] : memref<4xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_5[%24] : memref<4xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_6[%1] : memref<2xf16, #gpu.address_space> + } + gpu.barrier + %20 = arith.cmpi ult, %1, %c1 : index + scf.if %20 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_6[%21] : memref<2xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_6[%24] : memref<2xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %arg1[%0] : memref<2048xf16> + } + gpu.barrier + gpu.return + } + gpu.func @Unknown65_kernel(%arg0: memref<4x1000xf16>, %arg1: memref<4xf16>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 0.000000e+00 : f16 + %c1000 = arith.constant 1000 : index + %c-1024 = arith.constant -1024 : index + %c512 = arith.constant 512 : index %c-1 = arith.constant -1 : index - %c3 = arith.constant 3 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c9408 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c3 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c3 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c3 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x3x7x7xf32> - %37 = arith.truncf 
%36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x3x7x7xf16> - } - gpu.return - } - gpu.func @Unknown0(%arg0: memref<4x3x224x224xf32>, %arg1: memref<4x3x224x224xf16>) kernel { %c0 = arith.constant 0 : index - %c602112 = arith.constant 602112 : index - %c224 = arith.constant 224 : index + %c2 = arith.constant 2 : index + %0 = gpu.block_id x + %subview = memref.subview %arg0[%0, 0] [1, 1000] [1, 1] : memref<4x1000xf16> to memref<1000xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<1000xf16, strided<[1], offset: ?>> into memref<1x1000xf16, strided<[1000, 1], offset: ?>> + %alloca = memref.alloca() : memref<512xf16, #gpu.address_space> + %1 = gpu.thread_id x + %2 = arith.muli %1, %c2 : index + %3 = arith.cmpi slt, %1, %c0 : index + %4 = arith.subi %c-1, %1 : index + %5 = arith.select %3, %4, %1 : index + %6 = arith.divsi %5, %c512 : index + %7 = arith.subi %c-1, %6 : index + %8 = arith.select %3, %7, %6 : index + %9 = arith.muli %8, %c-1024 : index + %10 = arith.addi %2, %9 : index + %11 = arith.cmpi slt, %10, %c1000 : index + %12 = arith.select %11, %10, %c1000 : index + %13 = arith.addi %10, %c2 : index + %14 = arith.cmpi slt, %13, %c1000 : index + %15 = arith.select %14, %13, %c1000 : index + %16 = arith.subi %15, %12 : index + %subview_0 = memref.subview %expand_shape[0, %12] [1, %16] [1, 1] : memref<1x1000xf16, strided<[1000, 1], offset: ?>> to memref> + %expand_shape_1 = memref.expand_shape %subview_0 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %17 = arith.cmpi ugt, %16, %c0 : index + %18 = scf.if %17 -> (f16) { + %31 = memref.load %expand_shape_1[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %31 : f16 + } else { + scf.yield %cst : f16 + } + %19 = arith.cmpi ugt, %16, %c1 : index + %20 = scf.if %19 -> (f16) { + %31 = memref.load %expand_shape_1[%c0, %c1] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %31 : f16 + } else { + scf.yield %cst : f16 + } + %21 = arith.maximumf %18, %20 : f16 + memref.store %21, %alloca[%1] : memref<512xf16, #gpu.address_space> + gpu.barrier + %alloca_2 = memref.alloca() : memref<256xf16, #gpu.address_space> + %22 = arith.cmpi ult, %1, %c256 : index + scf.if %22 { + %31 = memref.load %alloca[%2] : memref<512xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca[%32] : memref<512xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %alloca_2[%1] : memref<256xf16, #gpu.address_space> + } + gpu.barrier + %alloca_3 = memref.alloca() : memref<128xf16, #gpu.address_space> + %23 = arith.cmpi ult, %1, %c128 : index + scf.if %23 { + %31 = memref.load %alloca_2[%2] : memref<256xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca_2[%32] : memref<256xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %alloca_3[%1] : memref<128xf16, #gpu.address_space> + } + gpu.barrier + %alloca_4 = memref.alloca() : memref<64xf16, #gpu.address_space> + %24 = arith.cmpi ult, %1, %c64 : index + scf.if %24 { + %31 = memref.load %alloca_3[%2] : memref<128xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca_3[%32] : memref<128xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %alloca_4[%1] : memref<64xf16, #gpu.address_space> + } + gpu.barrier + %alloca_5 = memref.alloca() : memref<32xf16, #gpu.address_space> + %25 = arith.cmpi ult, %1, %c32 : index + scf.if %25 { + %31 
= memref.load %alloca_4[%2] : memref<64xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca_4[%32] : memref<64xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %alloca_5[%1] : memref<32xf16, #gpu.address_space> + } + gpu.barrier + %alloca_6 = memref.alloca() : memref<16xf16, #gpu.address_space> + %26 = arith.cmpi ult, %1, %c16 : index + scf.if %26 { + %31 = memref.load %alloca_5[%2] : memref<32xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca_5[%32] : memref<32xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %alloca_6[%1] : memref<16xf16, #gpu.address_space> + } + gpu.barrier + %alloca_7 = memref.alloca() : memref<8xf16, #gpu.address_space> + %27 = arith.cmpi ult, %1, %c8 : index + scf.if %27 { + %31 = memref.load %alloca_6[%2] : memref<16xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca_6[%32] : memref<16xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %alloca_7[%1] : memref<8xf16, #gpu.address_space> + } + gpu.barrier + %alloca_8 = memref.alloca() : memref<4xf16, #gpu.address_space> + %28 = arith.cmpi ult, %1, %c4 : index + scf.if %28 { + %31 = memref.load %alloca_7[%2] : memref<8xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca_7[%32] : memref<8xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %alloca_8[%1] : memref<4xf16, #gpu.address_space> + } + gpu.barrier + %alloca_9 = memref.alloca() : memref<2xf16, #gpu.address_space> + %29 = arith.cmpi ult, %1, %c2 : index + scf.if %29 { + %31 = memref.load %alloca_8[%2] : memref<4xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca_8[%32] : memref<4xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %alloca_9[%1] : memref<2xf16, #gpu.address_space> + } + gpu.barrier + %30 = arith.cmpi ult, %1, %c1 : index + scf.if %30 { + %31 = memref.load %alloca_9[%2] : memref<2xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca_9[%32] : memref<2xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %arg1[%0] : memref<4xf16> + } + gpu.barrier + gpu.return + } + gpu.func @Unknown67_kernel(%arg0: memref<4x1000xf16>, %arg1: memref<4xf16>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 0.000000e+00 : f16 + %c1000 = arith.constant 1000 : index + %c-1024 = arith.constant -1024 : index + %c512 = arith.constant 512 : index %c-1 = arith.constant -1 : index - %c3 = arith.constant 3 : index + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %0 = gpu.block_id x + %subview = memref.subview %arg0[%0, 0] [1, 1000] [1, 1] : memref<4x1000xf16> to memref<1000xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<1000xf16, strided<[1], offset: ?>> into memref<1x1000xf16, strided<[1000, 1], offset: ?>> + %alloca = memref.alloca() : memref<512xf16, #gpu.address_space> + %1 = gpu.thread_id x + %2 = arith.muli %1, %c2 : index + %3 = arith.cmpi slt, %1, %c0 : index + %4 = 
arith.subi %c-1, %1 : index + %5 = arith.select %3, %4, %1 : index + %6 = arith.divsi %5, %c512 : index + %7 = arith.subi %c-1, %6 : index + %8 = arith.select %3, %7, %6 : index + %9 = arith.muli %8, %c-1024 : index + %10 = arith.addi %2, %9 : index + %11 = arith.cmpi slt, %10, %c1000 : index + %12 = arith.select %11, %10, %c1000 : index + %13 = arith.addi %10, %c2 : index + %14 = arith.cmpi slt, %13, %c1000 : index + %15 = arith.select %14, %13, %c1000 : index + %16 = arith.subi %15, %12 : index + %subview_0 = memref.subview %expand_shape[0, %12] [1, %16] [1, 1] : memref<1x1000xf16, strided<[1000, 1], offset: ?>> to memref> + %expand_shape_1 = memref.expand_shape %subview_0 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %17 = arith.cmpi ugt, %16, %c0 : index + %18 = scf.if %17 -> (f16) { + %34 = memref.load %expand_shape_1[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %34 : f16 + } else { + scf.yield %cst : f16 + } + %19 = math.exp %18 : f16 + %20 = arith.addf %19, %cst : f16 + %21 = arith.cmpi ugt, %16, %c1 : index + %22 = scf.if %21 -> (f16) { + %34 = memref.load %expand_shape_1[%c0, %c1] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %34 : f16 + } else { + scf.yield %cst : f16 + } + %23 = math.exp %22 : f16 + %24 = arith.addf %20, %23 : f16 + memref.store %24, %alloca[%1] : memref<512xf16, #gpu.address_space> + gpu.barrier + %alloca_2 = memref.alloca() : memref<256xf16, #gpu.address_space> + %25 = arith.cmpi ult, %1, %c256 : index + scf.if %25 { + %34 = memref.load %alloca[%2] : memref<512xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca[%36] : memref<512xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_2[%1] : memref<256xf16, #gpu.address_space> + } + gpu.barrier + %alloca_3 = memref.alloca() : memref<128xf16, #gpu.address_space> + %26 = arith.cmpi ult, %1, %c128 : index + scf.if %26 { + %34 = memref.load %alloca_2[%2] : memref<256xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca_2[%36] : memref<256xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_3[%1] : memref<128xf16, #gpu.address_space> + } + gpu.barrier + %alloca_4 = memref.alloca() : memref<64xf16, #gpu.address_space> + %27 = arith.cmpi ult, %1, %c64 : index + scf.if %27 { + %34 = memref.load %alloca_3[%2] : memref<128xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca_3[%36] : memref<128xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_4[%1] : memref<64xf16, #gpu.address_space> + } + gpu.barrier + %alloca_5 = memref.alloca() : memref<32xf16, #gpu.address_space> + %28 = arith.cmpi ult, %1, %c32 : index + scf.if %28 { + %34 = memref.load %alloca_4[%2] : memref<64xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca_4[%36] : memref<64xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_5[%1] : memref<32xf16, #gpu.address_space> + } + gpu.barrier + %alloca_6 = memref.alloca() : memref<16xf16, #gpu.address_space> + %29 = arith.cmpi ult, %1, %c16 : index + scf.if %29 { + %34 = memref.load %alloca_5[%2] : memref<32xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca_5[%36] : 
memref<32xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_6[%1] : memref<16xf16, #gpu.address_space> + } + gpu.barrier + %alloca_7 = memref.alloca() : memref<8xf16, #gpu.address_space> + %30 = arith.cmpi ult, %1, %c8 : index + scf.if %30 { + %34 = memref.load %alloca_6[%2] : memref<16xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca_6[%36] : memref<16xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_7[%1] : memref<8xf16, #gpu.address_space> + } + gpu.barrier + %alloca_8 = memref.alloca() : memref<4xf16, #gpu.address_space> + %31 = arith.cmpi ult, %1, %c4 : index + scf.if %31 { + %34 = memref.load %alloca_7[%2] : memref<8xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca_7[%36] : memref<8xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_8[%1] : memref<4xf16, #gpu.address_space> + } + gpu.barrier + %alloca_9 = memref.alloca() : memref<2xf16, #gpu.address_space> + %32 = arith.cmpi ult, %1, %c2 : index + scf.if %32 { + %34 = memref.load %alloca_8[%2] : memref<4xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca_8[%36] : memref<4xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_9[%1] : memref<2xf16, #gpu.address_space> + } + gpu.barrier + %33 = arith.cmpi ult, %1, %c1 : index + scf.if %33 { + %34 = memref.load %alloca_9[%2] : memref<2xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca_9[%36] : memref<2xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %arg1[%0] : memref<4xf16> + } + gpu.barrier + gpu.return + } + gpu.func @Unknown147_kernel(%arg0: memref<32x125xf16>, %arg1: memref<32x125xf32>, %arg2: memref<32xf32>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c2 = arith.constant 2 : index + %c64 = arith.constant 64 : index + %cst = arith.constant 0.000000e+00 : f32 + %cst_0 = arith.constant 0.000000e+00 : f16 + %c1 = arith.constant 1 : index + %c125 = arith.constant 125 : index + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c602112 : index - scf.if %5 { - %6 = arith.remsi %4, %c224 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c224 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c224 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c224 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c224 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c224 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c3 : index - %27 = 
arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c3 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c3 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x3x224x224xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x3x224x224xf16> - } + %subview = memref.subview %arg0[%0, 0] [1, 125] [1, 1] : memref<32x125xf16> to memref<125xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<125xf16, strided<[1], offset: ?>> into memref<1x125xf16, strided<[125, 1], offset: ?>> + %subview_1 = memref.subview %arg1[%0, 0] [1, 125] [1, 1] : memref<32x125xf32> to memref<125xf32, strided<[1], offset: ?>> + %expand_shape_2 = memref.expand_shape %subview_1 [[0, 1]] : memref<125xf32, strided<[1], offset: ?>> into memref<1x125xf32, strided<[125, 1], offset: ?>> + %alloca = memref.alloca() : memref<128xf32, #gpu.address_space> + %1 = gpu.thread_id x + %2 = arith.remsi %1, %c128 : index + %3 = arith.cmpi slt, %2, %c0 : index + %4 = arith.addi %2, %c128 : index + %5 = arith.select %3, %4, %2 : index + %6 = arith.cmpi slt, %5, %c125 : index + %7 = arith.select %6, %5, %c125 : index + %8 = arith.addi %5, %c1 : index + %9 = arith.cmpi slt, %8, %c125 : index + %10 = arith.select %9, %8, %c125 : index + %11 = arith.subi %10, %7 : index + %subview_3 = memref.subview %expand_shape[0, %7] [1, %11] [1, 1] : memref<1x125xf16, strided<[125, 1], offset: ?>> to memref> + %expand_shape_4 = memref.expand_shape %subview_3 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %subview_5 = memref.subview %expand_shape_2[0, %7] [1, %11] [1, 1] : memref<1x125xf32, strided<[125, 1], offset: ?>> to memref> + %expand_shape_6 = memref.expand_shape %subview_5 [[0, 1]] : memref> into memref<1x?xf32, strided<[?, 1], offset: ?>> + %12 = arith.cmpi ugt, %11, %c0 : index + %13:2 = scf.if %12 -> (f16, f32) { + %24 = memref.load %expand_shape_4[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + %25 = memref.load %expand_shape_6[%c0, %c0] : memref<1x?xf32, strided<[?, 1], offset: ?>> + scf.yield %24, %25 : f16, f32 + } else { + scf.yield %cst_0, %cst : f16, f32 + } + %14 = arith.extf %13#0 : f16 to f32 + %15 = arith.mulf %14, %13#1 : f32 + %16 = arith.addf %15, %cst : f32 + memref.store %16, %alloca[%1] : memref<128xf32, #gpu.address_space> + gpu.barrier + %alloca_7 = memref.alloca() : memref<64xf32, #gpu.address_space> + %17 = arith.cmpi ult, %1, %c64 : index + scf.if %17 { + %24 = arith.muli %1, %c2 : index + %25 = memref.load %alloca[%24] : memref<128xf32, #gpu.address_space> + %26 = arith.addf %25, %cst : f32 + %27 = arith.addi %24, %c1 : index + %28 = memref.load %alloca[%27] : memref<128xf32, #gpu.address_space> + %29 = arith.addf %28, %26 : f32 + memref.store %29, %alloca_7[%1] : memref<64xf32, #gpu.address_space> + } + gpu.barrier + %alloca_8 = memref.alloca() : memref<32xf32, #gpu.address_space> + %18 = arith.cmpi ult, %1, %c32 : index + scf.if %18 { + %24 = arith.muli %1, %c2 : index + %25 = memref.load %alloca_7[%24] : memref<64xf32, #gpu.address_space> + %26 = arith.addf %25, %cst : f32 + %27 = arith.addi %24, %c1 : index + %28 = memref.load %alloca_7[%27] : memref<64xf32, #gpu.address_space> + %29 = arith.addf %28, %26 : f32 + memref.store %29, %alloca_8[%1] : memref<32xf32, 
#gpu.address_space> + } + gpu.barrier + %alloca_9 = memref.alloca() : memref<16xf32, #gpu.address_space> + %19 = arith.cmpi ult, %1, %c16 : index + scf.if %19 { + %24 = arith.muli %1, %c2 : index + %25 = memref.load %alloca_8[%24] : memref<32xf32, #gpu.address_space> + %26 = arith.addf %25, %cst : f32 + %27 = arith.addi %24, %c1 : index + %28 = memref.load %alloca_8[%27] : memref<32xf32, #gpu.address_space> + %29 = arith.addf %28, %26 : f32 + memref.store %29, %alloca_9[%1] : memref<16xf32, #gpu.address_space> + } + gpu.barrier + %alloca_10 = memref.alloca() : memref<8xf32, #gpu.address_space> + %20 = arith.cmpi ult, %1, %c8 : index + scf.if %20 { + %24 = arith.muli %1, %c2 : index + %25 = memref.load %alloca_9[%24] : memref<16xf32, #gpu.address_space> + %26 = arith.addf %25, %cst : f32 + %27 = arith.addi %24, %c1 : index + %28 = memref.load %alloca_9[%27] : memref<16xf32, #gpu.address_space> + %29 = arith.addf %28, %26 : f32 + memref.store %29, %alloca_10[%1] : memref<8xf32, #gpu.address_space> + } + gpu.barrier + %alloca_11 = memref.alloca() : memref<4xf32, #gpu.address_space> + %21 = arith.cmpi ult, %1, %c4 : index + scf.if %21 { + %24 = arith.muli %1, %c2 : index + %25 = memref.load %alloca_10[%24] : memref<8xf32, #gpu.address_space> + %26 = arith.addf %25, %cst : f32 + %27 = arith.addi %24, %c1 : index + %28 = memref.load %alloca_10[%27] : memref<8xf32, #gpu.address_space> + %29 = arith.addf %28, %26 : f32 + memref.store %29, %alloca_11[%1] : memref<4xf32, #gpu.address_space> + } + gpu.barrier + %alloca_12 = memref.alloca() : memref<2xf32, #gpu.address_space> + %22 = arith.cmpi ult, %1, %c2 : index + scf.if %22 { + %24 = arith.muli %1, %c2 : index + %25 = memref.load %alloca_11[%24] : memref<4xf32, #gpu.address_space> + %26 = arith.addf %25, %cst : f32 + %27 = arith.addi %24, %c1 : index + %28 = memref.load %alloca_11[%27] : memref<4xf32, #gpu.address_space> + %29 = arith.addf %28, %26 : f32 + memref.store %29, %alloca_12[%1] : memref<2xf32, #gpu.address_space> + } + gpu.barrier + %23 = arith.cmpi ult, %1, %c1 : index + scf.if %23 { + %24 = arith.muli %1, %c2 : index + %25 = memref.load %alloca_12[%24] : memref<2xf32, #gpu.address_space> + %26 = arith.addf %25, %cst : f32 + %27 = arith.addi %24, %c1 : index + %28 = memref.load %alloca_12[%27] : memref<2xf32, #gpu.address_space> + %29 = arith.addf %28, %26 : f32 + memref.store %29, %arg2[%0] : memref<32xf32> + } + gpu.barrier + gpu.return + } + gpu.func @Unknown147_kernel_0(%arg0: memref<32xf32>, %arg1: memref) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c16 = arith.constant 16 : index + %cst = arith.constant 0.000000e+00 : f32 + %c32 = arith.constant 32 : index + %0 = gpu.block_id x + %alloca = memref.alloca() : memref<32xf32, #gpu.address_space> + %1 = gpu.thread_id x + %2 = arith.muli %0, %c32 : index + %3 = arith.addi %2, %1 : index + %4 = memref.load %arg0[%3] : memref<32xf32> + %5 = arith.addf %4, %cst : f32 + memref.store %5, %alloca[%1] : memref<32xf32, #gpu.address_space> + gpu.barrier + %alloca_0 = memref.alloca() : memref<16xf32, #gpu.address_space> + %6 = arith.cmpi ult, %1, %c16 : index + scf.if %6 { + %11 = arith.muli %1, %c2 : index + %12 = memref.load %alloca[%11] : memref<32xf32, #gpu.address_space> + %13 = arith.addf %12, %cst : f32 + %14 = arith.addi %11, %c1 : index + %15 = memref.load %alloca[%14] : memref<32xf32, #gpu.address_space> + %16 = 
arith.addf %15, %13 : f32 + memref.store %16, %alloca_0[%1] : memref<16xf32, #gpu.address_space> + } + gpu.barrier + %alloca_1 = memref.alloca() : memref<8xf32, #gpu.address_space> + %7 = arith.cmpi ult, %1, %c8 : index + scf.if %7 { + %11 = arith.muli %1, %c2 : index + %12 = memref.load %alloca_0[%11] : memref<16xf32, #gpu.address_space> + %13 = arith.addf %12, %cst : f32 + %14 = arith.addi %11, %c1 : index + %15 = memref.load %alloca_0[%14] : memref<16xf32, #gpu.address_space> + %16 = arith.addf %15, %13 : f32 + memref.store %16, %alloca_1[%1] : memref<8xf32, #gpu.address_space> + } + gpu.barrier + %alloca_2 = memref.alloca() : memref<4xf32, #gpu.address_space> + %8 = arith.cmpi ult, %1, %c4 : index + scf.if %8 { + %11 = arith.muli %1, %c2 : index + %12 = memref.load %alloca_1[%11] : memref<8xf32, #gpu.address_space> + %13 = arith.addf %12, %cst : f32 + %14 = arith.addi %11, %c1 : index + %15 = memref.load %alloca_1[%14] : memref<8xf32, #gpu.address_space> + %16 = arith.addf %15, %13 : f32 + memref.store %16, %alloca_2[%1] : memref<4xf32, #gpu.address_space> + } + gpu.barrier + %alloca_3 = memref.alloca() : memref<2xf32, #gpu.address_space> + %9 = arith.cmpi ult, %1, %c2 : index + scf.if %9 { + %11 = arith.muli %1, %c2 : index + %12 = memref.load %alloca_2[%11] : memref<4xf32, #gpu.address_space> + %13 = arith.addf %12, %cst : f32 + %14 = arith.addi %11, %c1 : index + %15 = memref.load %alloca_2[%14] : memref<4xf32, #gpu.address_space> + %16 = arith.addf %15, %13 : f32 + memref.store %16, %alloca_3[%1] : memref<2xf32, #gpu.address_space> + } + gpu.barrier + %10 = arith.cmpi ult, %1, %c1 : index + scf.if %10 { + %11 = arith.muli %1, %c2 : index + %12 = memref.load %alloca_3[%11] : memref<2xf32, #gpu.address_space> + %13 = arith.addf %12, %cst : f32 + %14 = arith.addi %11, %c1 : index + %15 = memref.load %alloca_3[%14] : memref<2xf32, #gpu.address_space> + %16 = arith.addf %15, %13 : f32 + memref.store %16, %arg1[] : memref + } + gpu.barrier + gpu.return + } + gpu.func @Unknown171_kernel(%arg0: memref<4x1000xf16>, %arg1: memref<1000xf32>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %cst = arith.constant 0.000000e+00 : f32 + %cst_0 = arith.constant 0.000000e+00 : f16 + %c2 = arith.constant 2 : index + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c32 = arith.constant 32 : index + %c1000 = arith.constant 1000 : index + %c-32 = arith.constant -32 : index + %0 = gpu.block_id x + %1 = arith.muli %0, %c-32 : index + %2 = arith.addi %1, %c1000 : index + %3 = arith.cmpi slt, %2, %c32 : index + %4 = arith.select %3, %2, %c32 : index + %5 = arith.muli %0, %c32 : index + %alloca = memref.alloca() : memref<32xf32, #gpu.address_space> + %alloca_1 = memref.alloca() : memref<2x32xf32, #gpu.address_space> + %6 = gpu.thread_id x + %7 = gpu.thread_id y + %8 = arith.cmpi slt, %4, %6 : index + %9 = arith.select %8, %4, %6 : index + %10 = arith.addi %6, %c1 : index + %11 = arith.cmpi slt, %4, %10 : index + %12 = arith.select %11, %4, %10 : index + %13 = arith.subi %12, %9 : index + %14 = arith.cmpi ugt, %13, %c0 : index + %15 = scf.if %14 -> (f16) { + %22 = arith.muli %7, %c2 : index + %23 = arith.addi %5, %9 : index + %24 = memref.load %arg0[%22, %23] : memref<4x1000xf16> + scf.yield %24 : f16 + } else { + scf.yield %cst_0 : f16 + } + %16 = arith.extf %15 : f16 to f32 + %17 = arith.addf %16, %cst : f32 + %18 = scf.if %14 -> (f16) { + %22 = arith.muli %7, %c2 : index + %23 = arith.addi %22, %c1 : index + %24 = arith.addi %5, %9 : index + %25 = 
memref.load %arg0[%23, %24] : memref<4x1000xf16> + scf.yield %25 : f16 + } else { + scf.yield %cst_0 : f16 + } + %19 = arith.extf %18 : f16 to f32 + %20 = arith.addf %17, %19 : f32 + memref.store %20, %alloca_1[%7, %6] : memref<2x32xf32, #gpu.address_space> + gpu.barrier + %21 = arith.cmpi ult, %7, %c1 : index + scf.if %21 { + %22 = memref.load %alloca_1[%c0, %6] : memref<2x32xf32, #gpu.address_space> + %23 = arith.addf %22, %cst : f32 + %24 = memref.load %alloca_1[%c1, %6] : memref<2x32xf32, #gpu.address_space> + %25 = arith.addf %24, %23 : f32 + memref.store %25, %alloca[%6] : memref<32xf32, #gpu.address_space> + } + gpu.barrier + %subview = memref.subview %alloca[0] [%4] [1] : memref<32xf32, #gpu.address_space> to memref, #gpu.address_space> + %subview_2 = memref.subview %arg1[%5] [%4] [1] : memref<1000xf32> to memref> + memref.copy %subview, %subview_2 : memref, #gpu.address_space> to memref> gpu.return } } func.func @main(%arg0: memref<4x3x224x224xf32, "cuda"> {byre.argname = "Input0", byre.argtype = 1 : i32}, %arg1: memref<4x1000xf32, "cuda"> {byre.argname = "Input1", byre.argtype = 1 : i32}, %arg2: memref<64x3x7x7xf32, "cuda"> {byre.argname = "Input2", byre.argtype = 1 : i32}, %arg3: memref<64xf32, "cuda"> {byre.argname = "Input3", byre.argtype = 1 : i32}, %arg4: memref<64xf32, "cuda"> {byre.argname = "Input4", byre.argtype = 1 : i32}, %arg5: memref<64xf32, "cuda"> {byre.argname = "Input5", byre.argtype = 1 : i32}, %arg6: memref<64xf32, "cuda"> {byre.argname = "Input6", byre.argtype = 1 : i32}, %arg7: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Input7", byre.argtype = 1 : i32}, %arg8: memref<64xf32, "cuda"> {byre.argname = "Input8", byre.argtype = 1 : i32}, %arg9: memref<64xf32, "cuda"> {byre.argname = "Input9", byre.argtype = 1 : i32}, %arg10: memref<64xf32, "cuda"> {byre.argname = "Input10", byre.argtype = 1 : i32}, %arg11: memref<64xf32, "cuda"> {byre.argname = "Input11", byre.argtype = 1 : i32}, %arg12: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Input12", byre.argtype = 1 : i32}, %arg13: memref<64xf32, "cuda"> {byre.argname = "Input13", byre.argtype = 1 : i32}, %arg14: memref<64xf32, "cuda"> {byre.argname = "Input14", byre.argtype = 1 : i32}, %arg15: memref<64xf32, "cuda"> {byre.argname = "Input15", byre.argtype = 1 : i32}, %arg16: memref<64xf32, "cuda"> {byre.argname = "Input16", byre.argtype = 1 : i32}, %arg17: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Input17", byre.argtype = 1 : i32}, %arg18: memref<64xf32, "cuda"> {byre.argname = "Input18", byre.argtype = 1 : i32}, %arg19: memref<64xf32, "cuda"> {byre.argname = "Input19", byre.argtype = 1 : i32}, %arg20: memref<64xf32, "cuda"> {byre.argname = "Input20", byre.argtype = 1 : i32}, %arg21: memref<64xf32, "cuda"> {byre.argname = "Input21", byre.argtype = 1 : i32}, %arg22: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Input22", byre.argtype = 1 : i32}, %arg23: memref<64xf32, "cuda"> {byre.argname = "Input23", byre.argtype = 1 : i32}, %arg24: memref<64xf32, "cuda"> {byre.argname = "Input24", byre.argtype = 1 : i32}, %arg25: memref<64xf32, "cuda"> {byre.argname = "Input25", byre.argtype = 1 : i32}, %arg26: memref<64xf32, "cuda"> {byre.argname = "Input26", byre.argtype = 1 : i32}, %arg27: memref<128x64x3x3xf32, "cuda"> {byre.argname = "Input27", byre.argtype = 1 : i32}, %arg28: memref<128xf32, "cuda"> {byre.argname = "Input28", byre.argtype = 1 : i32}, %arg29: memref<128xf32, "cuda"> {byre.argname = "Input29", byre.argtype = 1 : i32}, %arg30: memref<128xf32, "cuda"> {byre.argname = "Input30", byre.argtype = 1 : i32}, 
%arg31: memref<128xf32, "cuda"> {byre.argname = "Input31", byre.argtype = 1 : i32}, %arg32: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Input32", byre.argtype = 1 : i32}, %arg33: memref<128xf32, "cuda"> {byre.argname = "Input33", byre.argtype = 1 : i32}, %arg34: memref<128xf32, "cuda"> {byre.argname = "Input34", byre.argtype = 1 : i32}, %arg35: memref<128xf32, "cuda"> {byre.argname = "Input35", byre.argtype = 1 : i32}, %arg36: memref<128xf32, "cuda"> {byre.argname = "Input36", byre.argtype = 1 : i32}, %arg37: memref<128x64x1x1xf32, "cuda"> {byre.argname = "Input37", byre.argtype = 1 : i32}, %arg38: memref<128xf32, "cuda"> {byre.argname = "Input38", byre.argtype = 1 : i32}, %arg39: memref<128xf32, "cuda"> {byre.argname = "Input39", byre.argtype = 1 : i32}, %arg40: memref<128xf32, "cuda"> {byre.argname = "Input40", byre.argtype = 1 : i32}, %arg41: memref<128xf32, "cuda"> {byre.argname = "Input41", byre.argtype = 1 : i32}, %arg42: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Input42", byre.argtype = 1 : i32}, %arg43: memref<128xf32, "cuda"> {byre.argname = "Input43", byre.argtype = 1 : i32}, %arg44: memref<128xf32, "cuda"> {byre.argname = "Input44", byre.argtype = 1 : i32}, %arg45: memref<128xf32, "cuda"> {byre.argname = "Input45", byre.argtype = 1 : i32}, %arg46: memref<128xf32, "cuda"> {byre.argname = "Input46", byre.argtype = 1 : i32}, %arg47: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Input47", byre.argtype = 1 : i32}, %arg48: memref<128xf32, "cuda"> {byre.argname = "Input48", byre.argtype = 1 : i32}, %arg49: memref<128xf32, "cuda"> {byre.argname = "Input49", byre.argtype = 1 : i32}, %arg50: memref<128xf32, "cuda"> {byre.argname = "Input50", byre.argtype = 1 : i32}, %arg51: memref<128xf32, "cuda"> {byre.argname = "Input51", byre.argtype = 1 : i32}, %arg52: memref<256x128x3x3xf32, "cuda"> {byre.argname = "Input52", byre.argtype = 1 : i32}, %arg53: memref<256xf32, "cuda"> {byre.argname = "Input53", byre.argtype = 1 : i32}, %arg54: memref<256xf32, "cuda"> {byre.argname = "Input54", byre.argtype = 1 : i32}, %arg55: memref<256xf32, "cuda"> {byre.argname = "Input55", byre.argtype = 1 : i32}, %arg56: memref<256xf32, "cuda"> {byre.argname = "Input56", byre.argtype = 1 : i32}, %arg57: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Input57", byre.argtype = 1 : i32}, %arg58: memref<256xf32, "cuda"> {byre.argname = "Input58", byre.argtype = 1 : i32}, %arg59: memref<256xf32, "cuda"> {byre.argname = "Input59", byre.argtype = 1 : i32}, %arg60: memref<256xf32, "cuda"> {byre.argname = "Input60", byre.argtype = 1 : i32}, %arg61: memref<256xf32, "cuda"> {byre.argname = "Input61", byre.argtype = 1 : i32}, %arg62: memref<256x128x1x1xf32, "cuda"> {byre.argname = "Input62", byre.argtype = 1 : i32}, %arg63: memref<256xf32, "cuda"> {byre.argname = "Input63", byre.argtype = 1 : i32}, %arg64: memref<256xf32, "cuda"> {byre.argname = "Input64", byre.argtype = 1 : i32}, %arg65: memref<256xf32, "cuda"> {byre.argname = "Input65", byre.argtype = 1 : i32}, %arg66: memref<256xf32, "cuda"> {byre.argname = "Input66", byre.argtype = 1 : i32}, %arg67: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Input67", byre.argtype = 1 : i32}, %arg68: memref<256xf32, "cuda"> {byre.argname = "Input68", byre.argtype = 1 : i32}, %arg69: memref<256xf32, "cuda"> {byre.argname = "Input69", byre.argtype = 1 : i32}, %arg70: memref<256xf32, "cuda"> {byre.argname = "Input70", byre.argtype = 1 : i32}, %arg71: memref<256xf32, "cuda"> {byre.argname = "Input71", byre.argtype = 1 : i32}, %arg72: memref<256x256x3x3xf32, "cuda"> 
{byre.argname = "Input72", byre.argtype = 1 : i32}, %arg73: memref<256xf32, "cuda"> {byre.argname = "Input73", byre.argtype = 1 : i32}, %arg74: memref<256xf32, "cuda"> {byre.argname = "Input74", byre.argtype = 1 : i32}, %arg75: memref<256xf32, "cuda"> {byre.argname = "Input75", byre.argtype = 1 : i32}, %arg76: memref<256xf32, "cuda"> {byre.argname = "Input76", byre.argtype = 1 : i32}, %arg77: memref<512x256x3x3xf32, "cuda"> {byre.argname = "Input77", byre.argtype = 1 : i32}, %arg78: memref<512xf32, "cuda"> {byre.argname = "Input78", byre.argtype = 1 : i32}, %arg79: memref<512xf32, "cuda"> {byre.argname = "Input79", byre.argtype = 1 : i32}, %arg80: memref<512xf32, "cuda"> {byre.argname = "Input80", byre.argtype = 1 : i32}, %arg81: memref<512xf32, "cuda"> {byre.argname = "Input81", byre.argtype = 1 : i32}, %arg82: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Input82", byre.argtype = 1 : i32}, %arg83: memref<512xf32, "cuda"> {byre.argname = "Input83", byre.argtype = 1 : i32}, %arg84: memref<512xf32, "cuda"> {byre.argname = "Input84", byre.argtype = 1 : i32}, %arg85: memref<512xf32, "cuda"> {byre.argname = "Input85", byre.argtype = 1 : i32}, %arg86: memref<512xf32, "cuda"> {byre.argname = "Input86", byre.argtype = 1 : i32}, %arg87: memref<512x256x1x1xf32, "cuda"> {byre.argname = "Input87", byre.argtype = 1 : i32}, %arg88: memref<512xf32, "cuda"> {byre.argname = "Input88", byre.argtype = 1 : i32}, %arg89: memref<512xf32, "cuda"> {byre.argname = "Input89", byre.argtype = 1 : i32}, %arg90: memref<512xf32, "cuda"> {byre.argname = "Input90", byre.argtype = 1 : i32}, %arg91: memref<512xf32, "cuda"> {byre.argname = "Input91", byre.argtype = 1 : i32}, %arg92: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Input92", byre.argtype = 1 : i32}, %arg93: memref<512xf32, "cuda"> {byre.argname = "Input93", byre.argtype = 1 : i32}, %arg94: memref<512xf32, "cuda"> {byre.argname = "Input94", byre.argtype = 1 : i32}, %arg95: memref<512xf32, "cuda"> {byre.argname = "Input95", byre.argtype = 1 : i32}, %arg96: memref<512xf32, "cuda"> {byre.argname = "Input96", byre.argtype = 1 : i32}, %arg97: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Input97", byre.argtype = 1 : i32}, %arg98: memref<512xf32, "cuda"> {byre.argname = "Input98", byre.argtype = 1 : i32}, %arg99: memref<512xf32, "cuda"> {byre.argname = "Input99", byre.argtype = 1 : i32}, %arg100: memref<512xf32, "cuda"> {byre.argname = "Input100", byre.argtype = 1 : i32}, %arg101: memref<512xf32, "cuda"> {byre.argname = "Input101", byre.argtype = 1 : i32}, %arg102: memref<1000x512xf32, "cuda"> {byre.argname = "Input102", byre.argtype = 1 : i32}, %arg103: memref<1000xf32, "cuda"> {byre.argname = "Input103", byre.argtype = 1 : i32}, %arg104: memref {byre.argname = "Output0", byre.argtype = 2 : i32}, %arg105: memref<64x3x7x7xf32, "cuda"> {byre.argname = "Output1", byre.argtype = 2 : i32}, %arg106: memref<64xf32, "cuda"> {byre.argname = "Output2", byre.argtype = 2 : i32}, %arg107: memref<64xf32, "cuda"> {byre.argname = "Output3", byre.argtype = 2 : i32}, %arg108: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Output4", byre.argtype = 2 : i32}, %arg109: memref<64xf32, "cuda"> {byre.argname = "Output5", byre.argtype = 2 : i32}, %arg110: memref<64xf32, "cuda"> {byre.argname = "Output6", byre.argtype = 2 : i32}, %arg111: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Output7", byre.argtype = 2 : i32}, %arg112: memref<64xf32, "cuda"> {byre.argname = "Output8", byre.argtype = 2 : i32}, %arg113: memref<64xf32, "cuda"> {byre.argname = "Output9", byre.argtype = 2 : i32}, 
%arg114: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Output10", byre.argtype = 2 : i32}, %arg115: memref<64xf32, "cuda"> {byre.argname = "Output11", byre.argtype = 2 : i32}, %arg116: memref<64xf32, "cuda"> {byre.argname = "Output12", byre.argtype = 2 : i32}, %arg117: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Output13", byre.argtype = 2 : i32}, %arg118: memref<64xf32, "cuda"> {byre.argname = "Output14", byre.argtype = 2 : i32}, %arg119: memref<64xf32, "cuda"> {byre.argname = "Output15", byre.argtype = 2 : i32}, %arg120: memref<128x64x3x3xf32, "cuda"> {byre.argname = "Output16", byre.argtype = 2 : i32}, %arg121: memref<128xf32, "cuda"> {byre.argname = "Output17", byre.argtype = 2 : i32}, %arg122: memref<128xf32, "cuda"> {byre.argname = "Output18", byre.argtype = 2 : i32}, %arg123: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Output19", byre.argtype = 2 : i32}, %arg124: memref<128xf32, "cuda"> {byre.argname = "Output20", byre.argtype = 2 : i32}, %arg125: memref<128xf32, "cuda"> {byre.argname = "Output21", byre.argtype = 2 : i32}, %arg126: memref<128x64x1x1xf32, "cuda"> {byre.argname = "Output22", byre.argtype = 2 : i32}, %arg127: memref<128xf32, "cuda"> {byre.argname = "Output23", byre.argtype = 2 : i32}, %arg128: memref<128xf32, "cuda"> {byre.argname = "Output24", byre.argtype = 2 : i32}, %arg129: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Output25", byre.argtype = 2 : i32}, %arg130: memref<128xf32, "cuda"> {byre.argname = "Output26", byre.argtype = 2 : i32}, %arg131: memref<128xf32, "cuda"> {byre.argname = "Output27", byre.argtype = 2 : i32}, %arg132: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Output28", byre.argtype = 2 : i32}, %arg133: memref<128xf32, "cuda"> {byre.argname = "Output29", byre.argtype = 2 : i32}, %arg134: memref<128xf32, "cuda"> {byre.argname = "Output30", byre.argtype = 2 : i32}, %arg135: memref<256x128x3x3xf32, "cuda"> {byre.argname = "Output31", byre.argtype = 2 : i32}, %arg136: memref<256xf32, "cuda"> {byre.argname = "Output32", byre.argtype = 2 : i32}, %arg137: memref<256xf32, "cuda"> {byre.argname = "Output33", byre.argtype = 2 : i32}, %arg138: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Output34", byre.argtype = 2 : i32}, %arg139: memref<256xf32, "cuda"> {byre.argname = "Output35", byre.argtype = 2 : i32}, %arg140: memref<256xf32, "cuda"> {byre.argname = "Output36", byre.argtype = 2 : i32}, %arg141: memref<256x128x1x1xf32, "cuda"> {byre.argname = "Output37", byre.argtype = 2 : i32}, %arg142: memref<256xf32, "cuda"> {byre.argname = "Output38", byre.argtype = 2 : i32}, %arg143: memref<256xf32, "cuda"> {byre.argname = "Output39", byre.argtype = 2 : i32}, %arg144: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Output40", byre.argtype = 2 : i32}, %arg145: memref<256xf32, "cuda"> {byre.argname = "Output41", byre.argtype = 2 : i32}, %arg146: memref<256xf32, "cuda"> {byre.argname = "Output42", byre.argtype = 2 : i32}, %arg147: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Output43", byre.argtype = 2 : i32}, %arg148: memref<256xf32, "cuda"> {byre.argname = "Output44", byre.argtype = 2 : i32}, %arg149: memref<256xf32, "cuda"> {byre.argname = "Output45", byre.argtype = 2 : i32}, %arg150: memref<512x256x3x3xf32, "cuda"> {byre.argname = "Output46", byre.argtype = 2 : i32}, %arg151: memref<512xf32, "cuda"> {byre.argname = "Output47", byre.argtype = 2 : i32}, %arg152: memref<512xf32, "cuda"> {byre.argname = "Output48", byre.argtype = 2 : i32}, %arg153: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Output49", byre.argtype = 2 : i32}, %arg154: 
memref<512xf32, "cuda"> {byre.argname = "Output50", byre.argtype = 2 : i32}, %arg155: memref<512xf32, "cuda"> {byre.argname = "Output51", byre.argtype = 2 : i32}, %arg156: memref<512x256x1x1xf32, "cuda"> {byre.argname = "Output52", byre.argtype = 2 : i32}, %arg157: memref<512xf32, "cuda"> {byre.argname = "Output53", byre.argtype = 2 : i32}, %arg158: memref<512xf32, "cuda"> {byre.argname = "Output54", byre.argtype = 2 : i32}, %arg159: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Output55", byre.argtype = 2 : i32}, %arg160: memref<512xf32, "cuda"> {byre.argname = "Output56", byre.argtype = 2 : i32}, %arg161: memref<512xf32, "cuda"> {byre.argname = "Output57", byre.argtype = 2 : i32}, %arg162: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Output58", byre.argtype = 2 : i32}, %arg163: memref<512xf32, "cuda"> {byre.argname = "Output59", byre.argtype = 2 : i32}, %arg164: memref<512xf32, "cuda"> {byre.argname = "Output60", byre.argtype = 2 : i32}, %arg165: memref<1000x512xf32, "cuda"> {byre.argname = "Output61", byre.argtype = 2 : i32}, %arg166: memref<1000xf32, "cuda"> {byre.argname = "Output62", byre.argtype = 2 : i32}) attributes {byre.entry_point} { - %alloc = memref.alloc() : memref<76022848xi8, "cuda"> - %0 = "byre.alias"(%alloc) {offset = 8012864 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x3x224x224xf16, "cuda"> - byre.compute @PTXOp(%arg0, %0) {BlockSize.x = 128 : i32, GridSize.x = 4704 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown0", memory_effects = [1 : i32, 2 : i32]} : memref<4x3x224x224xf32, "cuda">, memref<4x3x224x224xf16, "cuda"> - %1 = "byre.alias"(%alloc) {offset = 62424128 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x3x7x7xf16, "cuda"> - byre.compute @PTXOp(%arg2, %1) {BlockSize.x = 128 : i32, GridSize.x = 74 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown1", memory_effects = [1 : i32, 2 : i32]} : memref<64x3x7x7xf32, "cuda">, memref<64x3x7x7xf16, "cuda"> - %2 = "byre.alias"(%alloc) {offset = 50996288 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x112x112xf16, "cuda"> + %alloc = memref.alloc() : memref<76533504xi8, "cuda"> + %0 = "byre.alias"(%alloc) <{offset = 75329280 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x3x224x224xf16, "cuda"> + byre.compute @PTXOp(%arg0, %0) {BlockSize.x = 256 : i32, GridSize.x = 588 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown0", memory_effects = [1 : i32, 2 : i32]} : memref<4x3x224x224xf32, "cuda">, memref<4x3x224x224xf16, "cuda"> + %1 = "byre.alias"(%alloc) <{offset = 62959360 : i64}> : (memref<76533504xi8, "cuda">) -> memref<64x3x7x7xf16, "cuda"> + byre.compute @PTXOp(%arg2, %1) {BlockSize.x = 256 : i32, GridSize.x = 10 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown1", memory_effects = [1 : i32, 2 : i32]} : memref<64x3x7x7xf32, "cuda">, memref<64x3x7x7xf16, "cuda"> + %2 = "byre.alias"(%alloc) <{offset = 49311488 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x112x112xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%0, %1, %2) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x3x224x224xf16, "cuda">, memref<64x3x7x7xf16, "cuda">, memref<4x64x112x112xf16, "cuda"> - %3 = "byre.alias"(%alloc) {offset = 44573760 : i64} : (memref<76022848xi8, 
"cuda">) -> memref<4x64x112x112xf16, "cuda"> + %3 = "byre.alias"(%alloc) <{offset = 42888960 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x112x112xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%2, %arg3, %arg4, %3) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x112x112xf16, "cuda"> - %4 = "byre.alias"(%alloc) {offset = 5080128 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg7, %4) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown3", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> - %5 = "byre.alias"(%alloc) {offset = 5006400 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg12, %5) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown4", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> - %6 = "byre.alias"(%alloc) {offset = 1552384 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg17, %6) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown5", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> - %7 = "byre.alias"(%alloc) {offset = 5153856 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg22, %7) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown6", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> - %8 = "byre.alias"(%alloc) {offset = 4247104 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x64x1x1xf16, "cuda"> - byre.compute @PTXOp(%arg37, %8) {BlockSize.x = 128 : i32, GridSize.x = 64 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown7", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x1x1xf32, "cuda">, memref<128x64x1x1xf16, "cuda"> - %9 = "byre.alias"(%alloc) {offset = 602112 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg27, %9) {BlockSize.x = 128 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown8", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x3x3xf32, "cuda">, memref<128x64x3x3xf16, "cuda"> - %10 = "byre.alias"(%alloc) {offset = 2383872 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg32, %10) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown9", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> - %11 = "byre.alias"(%alloc) {offset = 2088960 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg42, %11) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown10", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> - %12 = "byre.alias"(%alloc) {offset = 2678784 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg47, %12) 
{BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown11", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> - %13 = "byre.alias"(%alloc) {offset = 4940864 : i64} : (memref<76022848xi8, "cuda">) -> memref<256x128x1x1xf16, "cuda"> - byre.compute @PTXOp(%arg62, %13) {BlockSize.x = 128 : i32, GridSize.x = 256 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown12", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x1x1xf32, "cuda">, memref<256x128x1x1xf16, "cuda"> - %14 = "byre.alias"(%alloc) {offset = 60228672 : i64} : (memref<76022848xi8, "cuda">) -> memref<256x128x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg52, %14) {BlockSize.x = 128 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown13", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x3x3xf32, "cuda">, memref<256x128x3x3xf16, "cuda"> - %15 = "byre.alias"(%alloc) {offset = 73663552 : i64} : (memref<76022848xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg57, %15) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown14", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> - %16 = "byre.alias"(%alloc) {offset = 18850880 : i64} : (memref<76022848xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg67, %16) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown15", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> - %17 = "byre.alias"(%alloc) {offset = 6833216 : i64} : (memref<76022848xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg72, %17) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown16", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> - %18 = "byre.alias"(%alloc) {offset = 59425856 : i64} : (memref<76022848xi8, "cuda">) -> memref<512x256x1x1xf16, "cuda"> - byre.compute @PTXOp(%arg87, %18) {BlockSize.x = 128 : i32, GridSize.x = 1024 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown17", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x1x1xf32, "cuda">, memref<512x256x1x1xf16, "cuda"> - %19 = "byre.alias"(%alloc) {offset = 21636160 : i64} : (memref<76022848xi8, "cuda">) -> memref<512x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg77, %19) {BlockSize.x = 128 : i32, GridSize.x = 9216 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown18", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x3x3xf32, "cuda">, memref<512x256x3x3xf16, "cuda"> - %20 = "byre.alias"(%alloc) {offset = 38151232 : i64} : (memref<76022848xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg82, %20) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown19", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> - %21 = "byre.alias"(%alloc) {offset = 33432640 : i64} : (memref<76022848xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg92, %21) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown20", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> - 
%22 = "byre.alias"(%alloc) {offset = 28714048 : i64} : (memref<76022848xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg97, %22) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown21", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> - %23 = "byre.alias"(%alloc) {offset = 749568 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x1000xf16, "cuda"> - byre.compute @PTXOp(%arg1, %23) {BlockSize.x = 128 : i32, GridSize.x = 32 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown22", memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf32, "cuda">, memref<4x1000xf16, "cuda"> - %24 = "byre.alias"(%alloc) {offset = 23995456 : i64} : (memref<76022848xi8, "cuda">) -> memref<1000x512xf16, "cuda"> - byre.compute @PTXOp(%arg102, %24) {BlockSize.x = 128 : i32, GridSize.x = 4000 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown23", memory_effects = [1 : i32, 2 : i32]} : memref<1000x512xf32, "cuda">, memref<1000x512xf16, "cuda"> - %25 = "byre.alias"(%alloc) {offset = 757568 : i64} : (memref<76022848xi8, "cuda">) -> memref<4xf16, "cuda"> - byre.compute @ReduceSumOp_f16_f16(%23, %25) {device = "cuda", dimensions = dense<1> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf16, "cuda">, memref<4xf16, "cuda"> - %26 = "byre.alias"(%alloc) {offset = 62424128 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x112x112xf16, "cuda"> - %27 = "byre.alias"(%alloc) {offset = 59827264 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x112x112xi1, "cuda"> - byre.compute @PTXOp(%3, %26, %27) {BlockSize.x = 128 : i32, GridSize.x = 25088 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown24", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xi1, "cuda"> - %28 = "byre.alias"(%alloc) {offset = 5227584 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @PoolMaxOp_f16_f16(%26, %28) {base_dilations = dense<1> : tensor<4xi64>, device = "cuda", memory_effects = [1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = dense<1> : tensor<4xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %29 = "byre.alias"(%alloc) {offset = 20030528 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%28, %4, %29) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %30 = "byre.alias"(%alloc) {offset = 44573760 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%29, %arg8, %arg9, %30) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> - %31 
= "byre.alias"(%alloc) {offset = 17245248 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - %32 = "byre.alias"(%alloc) {offset = 301056 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xi1, "cuda"> - byre.compute @PTXOp(%30, %31, %32) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown26", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda"> - %33 = "byre.alias"(%alloc) {offset = 15639616 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%31, %5, %33) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %34 = "byre.alias"(%alloc) {offset = 42869824 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%33, %arg13, %arg14, %34) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> - %35 = "byre.alias"(%alloc) {offset = 501760 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xi1, "cuda"> - byre.compute @PTXOp(%34, %28, %30, %35) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown28", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda"> - %36 = "byre.alias"(%alloc) {offset = 14033984 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%30, %6, %36) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %37 = "byre.alias"(%alloc) {offset = 46179392 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%36, %arg18, %arg19, %37) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> - %38 = "byre.alias"(%alloc) {offset = 12428352 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - %39 = "byre.alias"(%alloc) {offset = 200704 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xi1, "cuda"> - byre.compute @PTXOp(%37, %38, %39) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown30", memory_effects = [1 : i32, 2 : 
i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda"> - %40 = "byre.alias"(%alloc) {offset = 9217088 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + %4 = "byre.alias"(%alloc) <{offset = 5545728 : i64}> : (memref<76533504xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg7, %4) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown3", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> + %5 = "byre.alias"(%alloc) <{offset = 5361664 : i64}> : (memref<76533504xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg12, %5) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown3", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> + %6 = "byre.alias"(%alloc) <{offset = 6283008 : i64}> : (memref<76533504xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg17, %6) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown3", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> + %7 = "byre.alias"(%alloc) <{offset = 6209280 : i64}> : (memref<76533504xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg22, %7) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown3", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> + %8 = "byre.alias"(%alloc) <{offset = 5463808 : i64}> : (memref<76533504xi8, "cuda">) -> memref<128x64x1x1xf16, "cuda"> + byre.compute @PTXOp(%arg37, %8) {BlockSize.x = 256 : i32, GridSize.x = 8 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown7", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x1x1xf32, "cuda">, memref<128x64x1x1xf16, "cuda"> + %9 = "byre.alias"(%alloc) <{offset = 6557440 : i64}> : (memref<76533504xi8, "cuda">) -> memref<128x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg27, %9) {BlockSize.x = 256 : i32, GridSize.x = 72 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown8", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x3x3xf32, "cuda">, memref<128x64x3x3xf16, "cuda"> + %10 = "byre.alias"(%alloc) <{offset = 2256896 : i64}> : (memref<76533504xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg32, %10) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown9", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> + %11 = "byre.alias"(%alloc) <{offset = 1761280 : i64}> : (memref<76533504xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg42, %11) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown9", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> + %12 = "byre.alias"(%alloc) <{offset = 0 : i64}> : (memref<76533504xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg47, %12) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown9", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> + %13 = "byre.alias"(%alloc) <{offset = 5480192 : 
i64}> : (memref<76533504xi8, "cuda">) -> memref<256x128x1x1xf16, "cuda"> + byre.compute @PTXOp(%arg62, %13) {BlockSize.x = 256 : i32, GridSize.x = 32 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown12", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x1x1xf32, "cuda">, memref<256x128x1x1xf16, "cuda"> + %14 = "byre.alias"(%alloc) <{offset = 5619456 : i64}> : (memref<76533504xi8, "cuda">) -> memref<256x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg52, %14) {BlockSize.x = 256 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown13", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x3x3xf32, "cuda">, memref<256x128x3x3xf16, "cuda"> + %15 = "byre.alias"(%alloc) <{offset = 74149632 : i64}> : (memref<76533504xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg57, %15) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown14", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> + %16 = "byre.alias"(%alloc) <{offset = 21556992 : i64}> : (memref<76533504xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg67, %16) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown14", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> + %17 = "byre.alias"(%alloc) <{offset = 8711936 : i64}> : (memref<76533504xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg72, %17) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown14", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> + %18 = "byre.alias"(%alloc) <{offset = 4558848 : i64}> : (memref<76533504xi8, "cuda">) -> memref<512x256x1x1xf16, "cuda"> + byre.compute @PTXOp(%arg87, %18) {BlockSize.x = 256 : i32, GridSize.x = 128 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown17", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x1x1xf32, "cuda">, memref<512x256x1x1xf16, "cuda"> + %19 = "byre.alias"(%alloc) <{offset = 23162624 : i64}> : (memref<76533504xi8, "cuda">) -> memref<512x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg77, %19) {BlockSize.x = 256 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown18", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x3x3xf32, "cuda">, memref<512x256x3x3xf16, "cuda"> + %20 = "byre.alias"(%alloc) <{offset = 28733184 : i64}> : (memref<76533504xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg82, %20) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown19", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> + %21 = "byre.alias"(%alloc) <{offset = 33451776 : i64}> : (memref<76533504xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg92, %21) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown19", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> + %22 = "byre.alias"(%alloc) <{offset = 38170368 : i64}> : (memref<76533504xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg97, %22) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = 
"Unknown19", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> + %23 = "byre.alias"(%alloc) <{offset = 5439616 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x1000xf16, "cuda"> + byre.compute @PTXOp(%arg1, %23) {BlockSize.x = 256 : i32, GridSize.x = 4 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown22", memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf32, "cuda">, memref<4x1000xf16, "cuda"> + %24 = "byre.alias"(%alloc) <{offset = 72969984 : i64}> : (memref<76533504xi8, "cuda">) -> memref<1000x512xf16, "cuda"> + byre.compute @PTXOp(%arg102, %24) {BlockSize.x = 256 : i32, GridSize.x = 500 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown23", memory_effects = [1 : i32, 2 : i32]} : memref<1000x512xf32, "cuda">, memref<1000x512xf16, "cuda"> + %25 = "byre.alias"(%alloc) <{offset = 73993984 : i64}> : (memref<76533504xi8, "cuda">) -> memref<1000xf16, "cuda"> + byre.compute @PTXOp(%arg103, %25) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32], kernel_name = "Unknown24", memory_effects = [1 : i32, 2 : i32]} : memref<1000xf32, "cuda">, memref<1000xf16, "cuda"> + %26 = "byre.alias"(%alloc) <{offset = 5435392 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4xf16, "cuda"> + byre.compute @PTXOp(%23, %26) {BlockSize.x = 512 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 4 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown25_kernel"} : memref<4x1000xf16, "cuda">, memref<4xf16, "cuda"> + %27 = "byre.alias"(%alloc) <{offset = 62959360 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x112x112xf16, "cuda"> + %28 = "byre.alias"(%alloc) <{offset = 25521920 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x112x112xi1, "cuda"> + byre.compute @PTXOp(%3, %27, %28) {BlockSize.x = 256 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown26", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xi1, "cuda"> + %29 = "byre.alias"(%alloc) <{offset = 15134464 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @PoolMaxOp_f16_f16(%27, %29) {base_dilations = dense<1> : tensor<4xi64>, device = "cuda", memory_effects = [1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = dense<1> : tensor<4xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + %30 = "byre.alias"(%alloc) <{offset = 16740096 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%29, %4, %30) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + %31 = "byre.alias"(%alloc) <{offset = 42888960 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%30, %arg8, %arg9, %31) {device = "cuda", epsilon = 9.99999974E-6 : f32, 
feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> + %32 = "byre.alias"(%alloc) <{offset = 19951360 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + %33 = "byre.alias"(%alloc) <{offset = 69381888 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xi1, "cuda"> + byre.compute @PTXOp(%31, %32, %33) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown28", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda"> + %34 = "byre.alias"(%alloc) <{offset = 7106304 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%32, %5, %34) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%34, %arg13, %arg14, %31) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> + %35 = "byre.alias"(%alloc) <{offset = 18345728 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + %36 = "byre.alias"(%alloc) <{offset = 70987520 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xi1, "cuda"> + byre.compute @PTXOp(%31, %29, %35, %36) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown30", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda"> + byre.compute @ConvOp_f16f16_f16(%35, %6, %31) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + %37 = "byre.alias"(%alloc) <{offset = 44494592 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%31, %arg18, %arg19, %37) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> + %38 = "byre.alias"(%alloc) <{offset = 13528832 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + %39 = "byre.alias"(%alloc) <{offset = 57339648 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xi1, "cuda"> + byre.compute @PTXOp(%37, %38, %39) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, 
arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown28", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda"> + %40 = "byre.alias"(%alloc) <{offset = 11923200 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%38, %7, %40) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%40, %arg23, %arg24, %37) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> - %41 = "byre.alias"(%alloc) {offset = 10822720 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - %42 = "byre.alias"(%alloc) {offset = 401408 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xi1, "cuda"> - byre.compute @PTXOp(%37, %30, %41, %42) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown32", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda"> - %43 = "byre.alias"(%alloc) {offset = 61621312 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%41, %8, %43) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<128x64x1x1xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - %44 = "byre.alias"(%alloc) {offset = 42869824 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %41 = "byre.alias"(%alloc) <{offset = 10317568 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%40, %arg23, %arg24, %41) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> + %42 = "byre.alias"(%alloc) <{offset = 70184704 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xi1, "cuda"> + byre.compute @PTXOp(%41, %35, %37, %42) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown30", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda"> + %43 = "byre.alias"(%alloc) <{offset = 58142464 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + byre.compute 
@ConvOp_f16f16_f16(%37, %8, %43) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<128x64x1x1xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + %44 = "byre.alias"(%alloc) <{offset = 10317568 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%43, %arg38, %arg39, %44) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda"> - %45 = "byre.alias"(%alloc) {offset = 70452288 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%41, %9, %45) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<128x64x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - %46 = "byre.alias"(%alloc) {offset = 46179392 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %45 = "byre.alias"(%alloc) <{offset = 59748096 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%37, %9, %45) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<128x64x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + %46 = "byre.alias"(%alloc) <{offset = 46100224 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%45, %arg28, %arg29, %46) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda"> - %47 = "byre.alias"(%alloc) {offset = 71255104 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> - %48 = "byre.alias"(%alloc) {offset = 4740160 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xi1, "cuda"> - byre.compute @PTXOp(%46, %47, %48) {BlockSize.x = 128 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown35", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda"> - %49 = "byre.alias"(%alloc) {offset = 72057920 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %47 = "byre.alias"(%alloc) <{offset = 60550912 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %48 = 
"byre.alias"(%alloc) <{offset = 3756032 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xi1, "cuda"> + byre.compute @PTXOp(%46, %47, %48) {BlockSize.x = 256 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown37", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda"> + %49 = "byre.alias"(%alloc) <{offset = 62156544 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%47, %10, %49) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%49, %arg33, %arg34, %46) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda"> - %50 = "byre.alias"(%alloc) {offset = 69649472 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> - %51 = "byre.alias"(%alloc) {offset = 4790336 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xi1, "cuda"> - byre.compute @PTXOp(%46, %44, %50, %51) {BlockSize.x = 128 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown37", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda"> - %52 = "byre.alias"(%alloc) {offset = 60818496 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %50 = "byre.alias"(%alloc) <{offset = 55734016 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %51 = "byre.alias"(%alloc) <{offset = 3354624 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xi1, "cuda"> + byre.compute @PTXOp(%46, %44, %50, %51) {BlockSize.x = 256 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown39", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda"> + %52 = "byre.alias"(%alloc) <{offset = 61353728 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%50, %11, %52) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%52, %arg43, %arg44, %46) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, 
memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda"> - %53 = "byre.alias"(%alloc) {offset = 57418816 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> - %54 = "byre.alias"(%alloc) {offset = 4840512 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xi1, "cuda"> - byre.compute @PTXOp(%46, %53, %54) {BlockSize.x = 128 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown39", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda"> - %55 = "byre.alias"(%alloc) {offset = 68846656 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %53 = "byre.alias"(%alloc) <{offset = 58945280 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %54 = "byre.alias"(%alloc) <{offset = 2953216 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xi1, "cuda"> + byre.compute @PTXOp(%46, %53, %54) {BlockSize.x = 256 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown37", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda"> + %55 = "byre.alias"(%alloc) <{offset = 56536832 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%53, %12, %55) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%55, %arg48, %arg49, %46) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda"> - %56 = "byre.alias"(%alloc) {offset = 72860736 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> - %57 = "byre.alias"(%alloc) {offset = 4890688 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xi1, "cuda"> - byre.compute @PTXOp(%46, %50, %56, %57) {BlockSize.x = 128 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown41", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda"> - %58 = "byre.alias"(%alloc) {offset = 2973696 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%56, %13, %58) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<256x128x1x1xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - %59 = "byre.alias"(%alloc) {offset = 46179392 : i64} : (memref<76022848xi8, 
"cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%58, %arg63, %arg64, %59) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> - %60 = "byre.alias"(%alloc) {offset = 59024448 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%56, %14, %60) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - %61 = "byre.alias"(%alloc) {offset = 46580800 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%60, %arg53, %arg54, %61) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> - %62 = "byre.alias"(%alloc) {offset = 58623040 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - %63 = "byre.alias"(%alloc) {offset = 4263488 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xi1, "cuda"> - byre.compute @PTXOp(%61, %62, %63) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown44", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda"> - %64 = "byre.alias"(%alloc) {offset = 58221632 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%62, %15, %64) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%64, %arg58, %arg59, %61) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> - %65 = "byre.alias"(%alloc) {offset = 4338752 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - %66 = "byre.alias"(%alloc) {offset = 4288576 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xi1, "cuda"> - byre.compute @PTXOp(%61, %59, %65, %66) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown46", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda"> - %67 = 
"byre.alias"(%alloc) {offset = 3776512 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%65, %16, %67) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%67, %arg68, %arg69, %59) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> - %68 = "byre.alias"(%alloc) {offset = 3375104 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - %69 = "byre.alias"(%alloc) {offset = 4313664 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xi1, "cuda"> - byre.compute @PTXOp(%59, %68, %69) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown48", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda"> - %70 = "byre.alias"(%alloc) {offset = 74843200 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%68, %17, %70) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%70, %arg73, %arg74, %59) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> - %71 = "byre.alias"(%alloc) {offset = 75244608 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - %72 = "byre.alias"(%alloc) {offset = 4177920 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xi1, "cuda"> - byre.compute @PTXOp(%59, %65, %71, %72) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown50", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda"> - %73 = "byre.alias"(%alloc) {offset = 950272 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%71, %18, %73) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : 
tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - %74 = "byre.alias"(%alloc) {offset = 42869824 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%73, %arg88, %arg89, %74) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> - %75 = "byre.alias"(%alloc) {offset = 1150976 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%71, %19, %75) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<512x256x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - %76 = "byre.alias"(%alloc) {offset = 46179392 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%75, %arg78, %arg79, %76) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> - %77 = "byre.alias"(%alloc) {offset = 1351680 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - %78 = "byre.alias"(%alloc) {offset = 59688000 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xi1, "cuda"> - byre.compute @PTXOp(%76, %77, %78) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown53", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda"> - %79 = "byre.alias"(%alloc) {offset = 0 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%77, %20, %79) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%79, %arg83, %arg84, %76) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> - %80 = "byre.alias"(%alloc) {offset = 1626112 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - %81 = "byre.alias"(%alloc) {offset = 59700544 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xi1, "cuda"> - byre.compute @PTXOp(%76, %74, %80, %81) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown55", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : 
memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda"> - byre.compute @ConvOp_f16f16_f16(%80, %21, %76) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - %82 = "byre.alias"(%alloc) {offset = 46380096 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%76, %arg93, %arg94, %82) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> - %83 = "byre.alias"(%alloc) {offset = 1826816 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - %84 = "byre.alias"(%alloc) {offset = 59713088 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xi1, "cuda"> - byre.compute @PTXOp(%82, %83, %84) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown57", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda"> - %85 = "byre.alias"(%alloc) {offset = 75646016 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%83, %22, %85) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%85, %arg98, %arg99, %74) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> - %86 = "byre.alias"(%alloc) {offset = 75846720 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xi1, "cuda"> - byre.compute @PTXOp(%74, %80, %82, %86) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown59", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda"> - %87 = "byre.alias"(%alloc) {offset = 42869824 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512xf16, "cuda"> - byre.compute @ReduceSumOp_f16_f16(%82, %87) {device = "cuda", dimensions = dense<[3, 2]> : tensor<2xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512xf16, "cuda"> - %88 = "byre.alias"(%alloc) {offset = 4203008 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512xf16, "cuda"> - byre.compute @PTXOp(%87, %88) {BlockSize.x = 128 : i32, GridSize.x = 16 : i32, arg_ranks = [2 : i32, 2 : i32], 
kernel_name = "Unknown60", memory_effects = [1 : i32, 2 : i32]} : memref<4x512xf16, "cuda">, memref<4x512xf16, "cuda"> - %89 = "byre.alias"(%alloc) {offset = 46380096 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x1000xf16, "cuda"> - byre.compute @MatmulOp_f16f16_f16(%88, %24, %89) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<4x512xf16, "cuda">, memref<1000x512xf16, "cuda">, memref<4x1000xf16, "cuda"> - %90 = "byre.alias"(%alloc) {offset = 25019456 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x1000xf16, "cuda"> - byre.compute @PTXOp(%arg103, %89, %90) {BlockSize.x = 128 : i32, GridSize.x = 32 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown61", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1000xf32, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf16, "cuda"> - %91 = "byre.alias"(%alloc) {offset = 25027456 : i64} : (memref<76022848xi8, "cuda">) -> memref<4xf16, "cuda"> - byre.compute @ReduceMaxOp_f16_f16(%90, %91) {device = "cuda", dimensions = dense<1> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf16, "cuda">, memref<4xf16, "cuda"> - %92 = "byre.alias"(%alloc) {offset = 42869824 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x1000xf16, "cuda"> - byre.compute @PTXOp(%91, %90, %92, %89) {BlockSize.x = 128 : i32, GridSize.x = 32 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf16, "cuda"> - %93 = "byre.alias"(%alloc) {offset = 42877824 : i64} : (memref<76022848xi8, "cuda">) -> memref<4xf16, "cuda"> - byre.compute @ReduceSumOp_f16_f16(%89, %93) {device = "cuda", dimensions = dense<1> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf16, "cuda">, memref<4xf16, "cuda"> - %94 = "byre.alias"(%alloc) {offset = 4207104 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x1000xf16, "cuda"> - %95 = "byre.alias"(%alloc) {offset = 4215104 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x1000xf32, "cuda"> - %96 = "byre.alias"(%alloc) {offset = 4231104 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x1000xf32, "cuda"> - byre.compute @PTXOp(%93, %92, %25, %23, %arg1, %94, %95, %96) {BlockSize.x = 128 : i32, GridSize.x = 32 : i32, arg_ranks = [1 : i32, 2 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown63", memory_effects = [1 : i32, 1 : i32, 1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf32, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf32, "cuda">, memref<4x1000xf32, "cuda"> - %97 = "byre.alias"(%alloc) {offset = 46380096 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512xf16, "cuda"> - byre.compute @MatmulOp_f16f16_f16(%94, %24, %97) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<4x1000xf16, "cuda">, memref<1000x512xf16, "cuda">, memref<4x512xf16, "cuda"> - %98 = "byre.alias"(%alloc) {offset = 749568 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @PTXOp(%97, %86, %98) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [2 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown64", memory_effects = [1 : i32, 1 : i32, 2 : 
i32]} : memref<4x512xf16, "cuda">, memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%85, %arg98, %98, %82, %arg163, %arg164) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%82, %22, %74) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%83, %82, %22) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> - byre.compute @PTXOp(%84, %74, %82) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown68", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%76, %arg93, %82, %74, %arg160, %arg161) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%74, %21, %76) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - %99 = "byre.alias"(%alloc) {offset = 23995456 : i64} : (memref<76022848xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%80, %74, %99) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> - byre.compute @PTXOp(%98, %76, %81, %83) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%79, %arg83, %83, %76, %arg154, %arg155) {device = "cuda", epsilon = 9.99999974E-6 : f32, 
feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%76, %20, %85) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%77, %76, %21) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> - byre.compute @PTXOp(%78, %85, %76) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown76", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - %100 = "byre.alias"(%alloc) {offset = 38151232 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%75, %arg78, %76, %100, %arg151, %arg152) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%100, %19, %59) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%71, %100, %19) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x256x3x3xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%73, %arg88, %83, %100, %arg157, %arg158) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%100, %18, %61) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> 
: tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - %101 = "byre.alias"(%alloc) {offset = 1826816 : i64} : (memref<76022848xi8, "cuda">) -> memref<512x256x1x1xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%71, %100, %101) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda"> - %102 = "byre.alias"(%alloc) {offset = 59425856 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @PTXOp(%61, %59, %72, %102) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown83", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%70, %arg73, %102, %59, %arg148, %arg149) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - %103 = "byre.alias"(%alloc) {offset = 38151232 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%59, %17, %103) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - %104 = "byre.alias"(%alloc) {offset = 74843200 : i64} : (memref<76022848xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%68, %59, %104) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%69, %103, %59) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown87", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%67, %arg68, %59, %103, %arg145, %arg146) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%103, %16, %59) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, 
input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%65, %103, %16) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%102, %59, %66, %65) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown91", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%64, %arg58, %65, %59, %arg139, %arg140) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%59, %15, %103) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%62, %59, %17) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%63, %103, %59) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown95", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%60, %arg53, %59, %103, %arg136, %arg137) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%103, %14, %46) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - byre.compute 
@ConvBackwardFilterOp_f16f16_f16(%56, %103, %14) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x128x3x3xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%58, %arg63, %65, %103, %arg142, %arg143) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - %105 = "byre.alias"(%alloc) {offset = 46982208 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%103, %13, %105) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x128x1x1xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - %106 = "byre.alias"(%alloc) {offset = 59425856 : i64} : (memref<76022848xi8, "cuda">) -> memref<256x128x1x1xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%56, %103, %106) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x128x1x1xf16, "cuda"> - byre.compute @PTXOp(%105, %46, %57, %56) {BlockSize.x = 128 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown102", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%55, %arg48, %56, %46, %arg133, %arg134) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - %107 = "byre.alias"(%alloc) {offset = 38151232 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %56 = "byre.alias"(%alloc) <{offset = 4960256 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xi1, "cuda"> + byre.compute @PTXOp(%46, %50, %44, %56) {BlockSize.x = 256 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown39", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda"> + %57 = "byre.alias"(%alloc) <{offset = 11120384 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%44, %13, %57) {batch_group_count = 1 : i64, device = 
"cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<256x128x1x1xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + %58 = "byre.alias"(%alloc) <{offset = 11521792 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%57, %arg63, %arg64, %58) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> + %59 = "byre.alias"(%alloc) <{offset = 46100224 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%44, %14, %59) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + %60 = "byre.alias"(%alloc) <{offset = 46501632 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%59, %arg53, %arg54, %60) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> + %61 = "byre.alias"(%alloc) <{offset = 2551808 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + %62 = "byre.alias"(%alloc) <{offset = 6704896 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xi1, "cuda"> + byre.compute @PTXOp(%60, %61, %62) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown46", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda"> + byre.compute @ConvOp_f16f16_f16(%61, %15, %60) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + %63 = "byre.alias"(%alloc) <{offset = 46903040 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%60, %arg58, %arg59, %63) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> + %64 = "byre.alias"(%alloc) <{offset = 4157440 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, 
"cuda"> + %65 = "byre.alias"(%alloc) <{offset = 6905600 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xi1, "cuda"> + byre.compute @PTXOp(%63, %58, %64, %65) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown48", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda"> + %66 = "byre.alias"(%alloc) <{offset = 22736640 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%64, %16, %66) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%66, %arg68, %arg69, %63) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> + %67 = "byre.alias"(%alloc) <{offset = 71790336 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + %68 = "byre.alias"(%alloc) <{offset = 2056192 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xi1, "cuda"> + byre.compute @PTXOp(%63, %67, %68) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown46", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda"> + %69 = "byre.alias"(%alloc) <{offset = 9891584 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%67, %17, %69) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%69, %arg73, %arg74, %63) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> + %70 = "byre.alias"(%alloc) <{offset = 72191744 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + %71 = "byre.alias"(%alloc) <{offset = 294912 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xi1, "cuda"> + byre.compute @PTXOp(%63, %64, %70, %71) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown48", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda"> + %72 = 
"byre.alias"(%alloc) <{offset = 495616 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%70, %18, %72) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + %73 = "byre.alias"(%alloc) <{offset = 11521792 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%72, %arg88, %arg89, %73) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> + %74 = "byre.alias"(%alloc) <{offset = 696320 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%70, %19, %74) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<512x256x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + %75 = "byre.alias"(%alloc) <{offset = 46903040 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%74, %arg78, %arg79, %75) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> + %76 = "byre.alias"(%alloc) <{offset = 897024 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + %77 = "byre.alias"(%alloc) <{offset = 6457088 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xi1, "cuda"> + byre.compute @PTXOp(%75, %76, %77) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown55", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda"> + byre.compute @ConvOp_f16f16_f16(%76, %20, %75) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + %78 = "byre.alias"(%alloc) <{offset = 47103744 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%75, %arg83, %arg84, %78) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, 
memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> + %79 = "byre.alias"(%alloc) <{offset = 1097728 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + %80 = "byre.alias"(%alloc) <{offset = 4820992 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xi1, "cuda"> + byre.compute @PTXOp(%78, %73, %79, %80) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown57", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda"> + %81 = "byre.alias"(%alloc) <{offset = 1298432 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%79, %21, %81) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%81, %arg93, %arg94, %78) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> + %82 = "byre.alias"(%alloc) <{offset = 1499136 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + %83 = "byre.alias"(%alloc) <{offset = 6356736 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xi1, "cuda"> + byre.compute @PTXOp(%78, %82, %83) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown55", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda"> + %84 = "byre.alias"(%alloc) <{offset = 72593152 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%82, %22, %84) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%84, %arg98, %arg99, %73) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> + %85 = "byre.alias"(%alloc) <{offset = 72793856 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xi1, "cuda"> + byre.compute @PTXOp(%73, %79, %78, %85) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown57", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda"> + %86 = 
"byre.alias"(%alloc) <{offset = 47103744 : i64}> : (memref<76533504xi8, "cuda">) -> memref<2048x49xf16, "cuda"> + %87 = "byre.alias"(%alloc) <{offset = 11521792 : i64}> : (memref<76533504xi8, "cuda">) -> memref<2048xf16, "cuda"> + byre.compute @PTXOp(%86, %87) {BlockSize.x = 64 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 2048 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown62_kernel"} : memref<2048x49xf16, "cuda">, memref<2048xf16, "cuda"> + %88 = "byre.alias"(%alloc) <{offset = 11521792 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512xf16, "cuda"> + %89 = "byre.alias"(%alloc) <{offset = 5435520 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512xf16, "cuda"> + byre.compute @PTXOp(%88, %89) {BlockSize.x = 256 : i32, GridSize.x = 2 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown63", memory_effects = [1 : i32, 2 : i32]} : memref<4x512xf16, "cuda">, memref<4x512xf16, "cuda"> + %90 = "byre.alias"(%alloc) <{offset = 47103744 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x1000xf16, "cuda"> + byre.compute @MatmulOp_f16f16_f16(%89, %24, %90) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<4x512xf16, "cuda">, memref<1000x512xf16, "cuda">, memref<4x1000xf16, "cuda"> + %91 = "byre.alias"(%alloc) <{offset = 11521792 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x1000xf16, "cuda"> + byre.compute @PTXOp(%25, %90, %91) {BlockSize.x = 256 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown64", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1000xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf16, "cuda"> + %92 = "byre.alias"(%alloc) <{offset = 11529856 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4xf16, "cuda"> + byre.compute @PTXOp(%91, %92) {BlockSize.x = 512 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 4 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown65_kernel"} : memref<4x1000xf16, "cuda">, memref<4xf16, "cuda"> + byre.compute @PTXOp(%92, %91, %90) {BlockSize.x = 256 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown66", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf16, "cuda"> + %93 = "byre.alias"(%alloc) <{offset = 47111808 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4xf16, "cuda"> + byre.compute @PTXOp(%90, %93) {BlockSize.x = 512 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 4 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown67_kernel"} : memref<4x1000xf16, "cuda">, memref<4xf16, "cuda"> + %94 = "byre.alias"(%alloc) <{offset = 11521792 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4xf16, "cuda"> + byre.compute @PTXOp(%93, %94) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32], kernel_name = "Unknown68", memory_effects = [1 : i32, 2 : i32]} : memref<4xf16, "cuda">, memref<4xf16, "cuda"> + %95 = "byre.alias"(%alloc) <{offset = 5447680 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x1000xf16, "cuda"> + %96 = "byre.alias"(%alloc) <{offset = 5455744 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x1000xf16, "cuda"> + byre.compute @PTXOp(%94, %90, %26, %23, %95, %96) {BlockSize.x = 256 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 2 : i32, 1 : i32, 2 : i32, 2 : i32, 2 
: i32], kernel_name = "Unknown69", memory_effects = [1 : i32, 1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf16, "cuda"> + %97 = "byre.alias"(%alloc) <{offset = 47103744 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512xf16, "cuda"> + byre.compute @MatmulOp_f16f16_f16(%96, %24, %97) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<4x1000xf16, "cuda">, memref<1000x512xf16, "cuda">, memref<4x512xf16, "cuda"> + %98 = "byre.alias"(%alloc) <{offset = 72969984 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @PTXOp(%97, %85, %98) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [2 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown70", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x512xf16, "cuda">, memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%84, %arg98, %98, %78, %arg163, %arg164) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%78, %22, %73) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%82, %78, %22) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%83, %73, %78) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown74", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%81, %arg93, %78, %73, %arg160, %arg161) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%73, %21, %78) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%79, %73, %21) 
{batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%98, %78, %80, %84) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown78", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%75, %arg83, %84, %73, %arg154, %arg155) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%73, %20, %75) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%76, %73, %20) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%77, %75, %73) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown74", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%74, %arg78, %73, %75, %arg151, %arg152) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%75, %19, %58) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%70, %75, %19) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x256x3x3xf16, 
"cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%72, %arg88, %84, %98, %arg157, %arg158) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%98, %18, %63) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + %99 = "byre.alias"(%alloc) <{offset = 1499136 : i64}> : (memref<76533504xi8, "cuda">) -> memref<512x256x1x1xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%70, %98, %99) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda"> + %100 = "byre.alias"(%alloc) <{offset = 4558848 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @PTXOp(%63, %58, %71, %100) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown89", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%69, %arg73, %100, %63, %arg148, %arg149) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%63, %17, %58) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + %101 = "byre.alias"(%alloc) <{offset = 72969984 : i64}> : (memref<76533504xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%67, %63, %101) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%68, %58, %63) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown93", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xi1, "cuda">, 
memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%66, %arg68, %63, %58, %arg145, %arg146) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%58, %16, %63) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + %102 = "byre.alias"(%alloc) <{offset = 71790336 : i64}> : (memref<76533504xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%64, %58, %102) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> + %103 = "byre.alias"(%alloc) <{offset = 21556992 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @PTXOp(%100, %63, %65, %103) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown89", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda"> + %104 = "byre.alias"(%alloc) <{offset = 8711936 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%60, %arg58, %103, %104, %arg139, %arg140) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%104, %15, %58) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%61, %104, %15) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%62, %58, %60) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown93", memory_effects = [1 : 
i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%59, %arg53, %60, %58, %arg136, %arg137) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%58, %14, %46) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%44, %58, %14) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x128x3x3xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%57, %arg63, %103, %104, %arg142, %arg143) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + %105 = "byre.alias"(%alloc) <{offset = 46903040 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%104, %13, %105) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x128x1x1xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + %106 = "byre.alias"(%alloc) <{offset = 4558848 : i64}> : (memref<76533504xi8, "cuda">) -> memref<256x128x1x1xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%44, %104, %106) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x128x1x1xf16, "cuda"> + byre.compute @PTXOp(%105, %46, %56, %44) {BlockSize.x = 256 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown108", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%55, %arg48, %44, %46, %arg133, %arg134) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : 
memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + %107 = "byre.alias"(%alloc) <{offset = 11120384 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%46, %12, %107) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - %108 = "byre.alias"(%alloc) {offset = 68846656 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + %108 = "byre.alias"(%alloc) <{offset = 56536832 : i64}> : (memref<76533504xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%53, %46, %108) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> - %109 = "byre.alias"(%alloc) {offset = 73663552 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> - byre.compute @PTXOp(%54, %107, %109) {BlockSize.x = 128 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown106", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%52, %arg43, %109, %46, %arg130, %arg131) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%46, %11, %107) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - %110 = "byre.alias"(%alloc) {offset = 69141568 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%50, %46, %110) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> - byre.compute @PTXOp(%56, %107, %51, %109) {BlockSize.x = 128 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown110", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, 
memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%49, %arg33, %109, %46, %arg124, %arg125) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%46, %10, %107) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - %111 = "byre.alias"(%alloc) {offset = 72860736 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%54, %107, %46) {BlockSize.x = 256 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown112", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%52, %arg43, %46, %107, %arg130, %arg131) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%107, %11, %46) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + %109 = "byre.alias"(%alloc) <{offset = 61353728 : i64}> : (memref<76533504xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%50, %107, %109) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> + %110 = "byre.alias"(%alloc) <{offset = 8711936 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%44, %46, %51, %110) {BlockSize.x = 256 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown108", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%49, %arg33, %110, %46, %arg124, %arg125) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, 
memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%46, %10, %44) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + %111 = "byre.alias"(%alloc) <{offset = 55734016 : i64}> : (memref<76533504xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%47, %46, %111) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> - byre.compute @PTXOp(%48, %107, %46) {BlockSize.x = 128 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown114", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%45, %arg28, %46, %107, %arg121, %arg122) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%107, %9, %37) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %112 = "byre.alias"(%alloc) {offset = 73155648 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x64x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%41, %107, %112) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x64x3x3xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%43, %arg38, %109, %107, %arg127, %arg128) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - %113 = "byre.alias"(%alloc) {offset = 47785024 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%107, %8, %113) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", 
memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x64x1x1xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %114 = "byre.alias"(%alloc) {offset = 73663552 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x64x1x1xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%41, %107, %114) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x64x1x1xf16, "cuda"> - byre.compute @PTXOp(%113, %37, %42, %41) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown121", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda"> - %115 = "byre.alias"(%alloc) {offset = 38151232 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%40, %arg23, %41, %115, %arg118, %arg119) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%115, %7, %37) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %116 = "byre.alias"(%alloc) {offset = 9217088 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%38, %115, %116) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%39, %37, %115) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown125", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%36, %arg18, %115, %37, %arg115, %arg116) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%37, %6, %115) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", 
kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %117 = "byre.alias"(%alloc) {offset = 12428352 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%30, %37, %117) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%41, %115, %35, %36) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown129", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%33, %arg13, %36, %30, %arg112, %arg113) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%30, %5, %115) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %118 = "byre.alias"(%alloc) {offset = 15639616 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%31, %30, %118) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%32, %115, %30) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown133", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%29, %arg8, %30, %115, %arg109, %arg110) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%115, %4, %30) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = 
dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %119 = "byre.alias"(%alloc) {offset = 20030528 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%28, %115, %119) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%36, %30, %115) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown137", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - byre.compute @PoolMaxGradOp_f16f16_f16(%26, %115, %3) {device = "cuda", memory_effects = [1 : i32, 1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x112x112xf16, "cuda"> - %120 = "byre.alias"(%alloc) {offset = 38151232 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x112x112xf16, "cuda"> - byre.compute @PTXOp(%27, %3, %120) {BlockSize.x = 128 : i32, GridSize.x = 25088 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown138", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x64x112x112xi1, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%2, %arg3, %120, %26, %arg106, %arg107) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - %121 = "byre.alias"(%alloc) {offset = 50996288 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x3x7x7xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%0, %26, %121) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x3x224x224xf16, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<64x3x7x7xf16, "cuda"> - %122 = "byre.alias"(%alloc) {offset = 62424128 : i64} : (memref<76022848xi8, "cuda">) -> memref - byre.compute @ReduceSumOp_f32_f32(%95, %122) {device = "cuda", dimensions = dense<[0, 1]> : tensor<2xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf32, "cuda">, memref - byre.compute @PTXOp(%122, %arg104) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [0 : i32, 0 : i32], kernel_name = "Unknown141", memory_effects = [1 : i32, 2 : i32]} : memref, memref - byre.compute @PTXOp(%121, %arg105) {BlockSize.x = 128 : i32, GridSize.x = 74 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown142", memory_effects = [1 : i32, 2 : i32]} : memref<64x3x7x7xf16, "cuda">, memref<64x3x7x7xf32, "cuda"> - 
byre.compute @PTXOp(%119, %arg108) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown143", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%118, %arg111) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown144", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%117, %arg114) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown145", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%116, %arg117) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown146", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%112, %arg120) {BlockSize.x = 128 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown147", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x3x3xf16, "cuda">, memref<128x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%111, %arg123) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown148", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> - byre.compute @PTXOp(%114, %arg126) {BlockSize.x = 128 : i32, GridSize.x = 64 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown149", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x1x1xf16, "cuda">, memref<128x64x1x1xf32, "cuda"> - byre.compute @PTXOp(%110, %arg129) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown150", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> - byre.compute @PTXOp(%108, %arg132) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown151", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> - byre.compute @PTXOp(%14, %arg135) {BlockSize.x = 128 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown152", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x3x3xf16, "cuda">, memref<256x128x3x3xf32, "cuda"> - byre.compute @PTXOp(%17, %arg138) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown153", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> - byre.compute @PTXOp(%106, %arg141) {BlockSize.x = 128 : i32, GridSize.x = 256 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown154", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x1x1xf16, "cuda">, memref<256x128x1x1xf32, "cuda"> - byre.compute @PTXOp(%16, %arg144) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown155", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> - byre.compute @PTXOp(%104, %arg147) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown156", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> - byre.compute @PTXOp(%19, %arg150) {BlockSize.x = 128 : 
i32, GridSize.x = 9216 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown157", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x3x3xf16, "cuda">, memref<512x256x3x3xf32, "cuda"> - byre.compute @PTXOp(%21, %arg153) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown158", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> - byre.compute @PTXOp(%101, %arg156) {BlockSize.x = 128 : i32, GridSize.x = 1024 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown159", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x1x1xf16, "cuda">, memref<512x256x1x1xf32, "cuda"> - byre.compute @PTXOp(%99, %arg159) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown160", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> - byre.compute @PTXOp(%22, %arg162) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown161", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> - %123 = "byre.alias"(%alloc) {offset = 62424128 : i64} : (memref<76022848xi8, "cuda">) -> memref<1000x512xf16, "cuda"> - byre.compute @MatmulOp_f16f16_f16(%88, %94, %123) {device = "cuda", lhs_contracting_dimension = 0 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_transpose, rhs_contracting_dimension = 0 : i64} : memref<4x512xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<1000x512xf16, "cuda"> - byre.compute @PTXOp(%123, %arg165) {BlockSize.x = 128 : i32, GridSize.x = 4000 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown163", memory_effects = [1 : i32, 2 : i32]} : memref<1000x512xf16, "cuda">, memref<1000x512xf32, "cuda"> - %124 = "byre.alias"(%alloc) {offset = 62424128 : i64} : (memref<76022848xi8, "cuda">) -> memref<1000xf32, "cuda"> - byre.compute @ReduceSumOp_f32_f32(%96, %124) {device = "cuda", dimensions = dense<0> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf32, "cuda">, memref<1000xf32, "cuda"> - byre.compute @PTXOp(%124, %arg166) {BlockSize.x = 128 : i32, GridSize.x = 8 : i32, arg_ranks = [1 : i32, 1 : i32], kernel_name = "Unknown164", memory_effects = [1 : i32, 2 : i32]} : memref<1000xf32, "cuda">, memref<1000xf32, "cuda"> + byre.compute @PTXOp(%48, %44, %46) {BlockSize.x = 256 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown112", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + %112 = "byre.alias"(%alloc) <{offset = 9514752 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%45, %arg28, %46, %112, %arg121, %arg122) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%112, %9, %41) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, 
window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + %113 = "byre.alias"(%alloc) <{offset = 56028928 : i64}> : (memref<76533504xi8, "cuda">) -> memref<128x64x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%37, %112, %113) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x64x3x3xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%43, %arg38, %110, %46, %arg127, %arg128) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + %114 = "byre.alias"(%alloc) <{offset = 8711936 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%46, %8, %114) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x64x1x1xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + %115 = "byre.alias"(%alloc) <{offset = 62156544 : i64}> : (memref<76533504xi8, "cuda">) -> memref<128x64x1x1xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%37, %46, %115) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x64x1x1xf16, "cuda"> + %116 = "byre.alias"(%alloc) <{offset = 21556992 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @PTXOp(%114, %41, %42, %116) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown127", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%40, %arg23, %116, %37, %arg118, %arg119) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%37, %7, %40) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + 
%117 = "byre.alias"(%alloc) <{offset = 10317568 : i64}> : (memref<76533504xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%38, %37, %117) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%39, %40, %37) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown131", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%31, %arg18, %37, %38, %arg115, %arg116) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%38, %6, %31) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + %118 = "byre.alias"(%alloc) <{offset = 11923200 : i64}> : (memref<76533504xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%35, %38, %118) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%116, %31, %36, %35) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown127", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%34, %arg13, %35, %31, %arg112, %arg113) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%31, %5, %34) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + %119 = "byre.alias"(%alloc) <{offset = 13528832 : i64}> : (memref<76533504xi8, "cuda">) -> 
memref<64x64x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%32, %31, %119) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%33, %34, %31) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown131", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%30, %arg8, %31, %34, %arg109, %arg110) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%34, %4, %31) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + %120 = "byre.alias"(%alloc) <{offset = 19951360 : i64}> : (memref<76533504xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%29, %34, %120) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%35, %31, %34) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown143", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + byre.compute @PoolMaxGradOp_f16f16_f16(%27, %34, %3) {device = "cuda", memory_effects = [1 : i32, 1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x112x112xf16, "cuda"> + byre.compute @PTXOp(%28, %3, %27) {BlockSize.x = 256 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown144", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x64x112x112xi1, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%2, %arg3, %27, %3, %arg106, %arg107) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, 
memref<64xf32, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%0, %3, %1) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x3x224x224xf16, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<64x3x7x7xf16, "cuda"> + %121 = "byre.alias"(%alloc) <{offset = 62978176 : i64}> : (memref<76533504xi8, "cuda">) -> memref + %122 = "byre.alias"(%alloc) <{offset = 5447680 : i64}> : (memref<76533504xi8, "cuda">) -> memref<32x125xf16, "cuda"> + %123 = "byre.alias"(%arg1) <{offset = 0 : i64}> : (memref<4x1000xf32, "cuda">) -> memref<32x125xf32, "cuda"> + %124 = "byre.alias"(%alloc) <{offset = 49311488 : i64}> : (memref<76533504xi8, "cuda">) -> memref<32xf32, "cuda"> + byre.compute @PTXOp(%122, %123, %124) {BlockSize.x = 128 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 32 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown147_kernel"} : memref<32x125xf16, "cuda">, memref<32x125xf32, "cuda">, memref<32xf32, "cuda"> + byre.compute @PTXOp(%124, %121) {BlockSize.x = 32 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 1 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown147_kernel_0"} : memref<32xf32, "cuda">, memref + byre.compute @PTXOp(%121, %arg104) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [0 : i32, 0 : i32], kernel_name = "Unknown148", memory_effects = [1 : i32, 2 : i32]} : memref, memref + byre.compute @PTXOp(%1, %arg105) {BlockSize.x = 256 : i32, GridSize.x = 10 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown149", memory_effects = [1 : i32, 2 : i32]} : memref<64x3x7x7xf16, "cuda">, memref<64x3x7x7xf32, "cuda"> + byre.compute @PTXOp(%120, %arg108) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown150", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> + byre.compute @PTXOp(%119, %arg111) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown150", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> + byre.compute @PTXOp(%118, %arg114) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown150", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> + byre.compute @PTXOp(%117, %arg117) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown150", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> + byre.compute @PTXOp(%113, %arg120) {BlockSize.x = 256 : i32, GridSize.x = 72 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown154", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x3x3xf16, "cuda">, memref<128x64x3x3xf32, "cuda"> + byre.compute @PTXOp(%111, %arg123) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown155", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> + byre.compute @PTXOp(%115, %arg126) {BlockSize.x = 256 : i32, GridSize.x = 8 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown156", memory_effects 
= [1 : i32, 2 : i32]} : memref<128x64x1x1xf16, "cuda">, memref<128x64x1x1xf32, "cuda"> + byre.compute @PTXOp(%109, %arg129) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown155", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> + byre.compute @PTXOp(%108, %arg132) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown155", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> + byre.compute @PTXOp(%14, %arg135) {BlockSize.x = 256 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown159", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x3x3xf16, "cuda">, memref<256x128x3x3xf32, "cuda"> + byre.compute @PTXOp(%15, %arg138) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown160", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> + byre.compute @PTXOp(%106, %arg141) {BlockSize.x = 256 : i32, GridSize.x = 32 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown161", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x1x1xf16, "cuda">, memref<256x128x1x1xf32, "cuda"> + byre.compute @PTXOp(%102, %arg144) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown160", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> + byre.compute @PTXOp(%101, %arg147) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown160", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> + byre.compute @PTXOp(%19, %arg150) {BlockSize.x = 256 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown164", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x3x3xf16, "cuda">, memref<512x256x3x3xf32, "cuda"> + byre.compute @PTXOp(%20, %arg153) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown165", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> + byre.compute @PTXOp(%99, %arg156) {BlockSize.x = 256 : i32, GridSize.x = 128 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown166", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x1x1xf16, "cuda">, memref<512x256x1x1xf32, "cuda"> + byre.compute @PTXOp(%21, %arg159) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown165", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> + byre.compute @PTXOp(%22, %arg162) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown165", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> + %125 = "byre.alias"(%alloc) <{offset = 62959360 : i64}> : (memref<76533504xi8, "cuda">) -> memref<1000x512xf16, "cuda"> + byre.compute @MatmulOp_f16f16_f16(%89, %96, %125) {device = "cuda", lhs_contracting_dimension = 0 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_transpose, rhs_contracting_dimension = 0 : i64} : memref<4x512xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<1000x512xf16, "cuda"> + byre.compute @PTXOp(%125, %arg165) 
{BlockSize.x = 256 : i32, GridSize.x = 500 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown170", memory_effects = [1 : i32, 2 : i32]} : memref<1000x512xf16, "cuda">, memref<1000x512xf32, "cuda"> + %126 = "byre.alias"(%alloc) <{offset = 62959360 : i64}> : (memref<76533504xi8, "cuda">) -> memref<1000xf32, "cuda"> + byre.compute @PTXOp(%96, %126) {BlockSize.x = 32 : i32, BlockSize.y = 2 : i32, BlockSize.z = 1 : i32, GridSize.x = 32 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown171_kernel"} : memref<4x1000xf16, "cuda">, memref<1000xf32, "cuda"> + byre.compute @PTXOp(%126, %arg166) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32], kernel_name = "Unknown172", memory_effects = [1 : i32, 2 : i32]} : memref<1000xf32, "cuda">, memref<1000xf32, "cuda"> return } } \ No newline at end of file diff --git a/compiler/test/E2E/ResNet18/Whole/9b_nvvm_codegen.mlir b/compiler/test/E2E/ResNet18/Whole/9b_nvvm_codegen.mlir index 1cd7dd3b4..1e4d27090 100644 --- a/compiler/test/E2E/ResNet18/Whole/9b_nvvm_codegen.mlir +++ b/compiler/test/E2E/ResNet18/Whole/9b_nvvm_codegen.mlir @@ -1,4298 +1,2464 @@ -// RUN: byteir-opt %s -nvvm-codegen | FileCheck %s - -// CHECK-LABEL: gpu.module @unified - module @IrToMhlo.2452 attributes {byre.container_module, gpu.container_module} { gpu.module @unified { - gpu.func @Unknown164(%arg0: memref<1000xf32>, %arg1: memref<1000xf32>) kernel { + gpu.func @Unknown172(%arg0: memref<1000xf32>, %arg1: memref<1000xf32>) kernel { %c1000 = arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1000 : index - scf.if %5 { - %6 = memref.load %arg0[%4] : memref<1000xf32> - %7 = arith.truncf %6 : f32 to f16 - %8 = arith.extf %7 : f16 to f32 - memref.store %8, %arg1[%4] : memref<1000xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1000 step %6 { + %7 = memref.load %arg0[%arg2] : memref<1000xf32> + %8 = arith.truncf %7 : f32 to f16 + %9 = arith.extf %8 : f16 to f32 + memref.store %9, %arg1[%arg2] : memref<1000xf32> } gpu.return } - gpu.func @Unknown163(%arg0: memref<1000x512xf16>, %arg1: memref<1000x512xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown170(%arg0: memref<1000x512xf16>, %arg1: memref<1000x512xf32>) kernel { %c512000 = arith.constant 512000 : index %c512 = arith.constant 512 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512000 : index - scf.if %5 { - %6 = arith.remsi %4, %c512 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c512 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c512 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9] : memref<1000x512xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9] : memref<1000x512xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c512000 step %6 { + %7 = arith.remsi %arg2, %c512 : index + %8 = arith.divsi %arg2, %c512 : index + %9 = memref.load %arg0[%8, %7] : memref<1000x512xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7] : 
memref<1000x512xf32> } gpu.return } - gpu.func @Unknown161(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { + gpu.func @Unknown166(%arg0: memref<512x256x1x1xf16>, %arg1: memref<512x256x1x1xf32>) kernel { + %c131072 = arith.constant 131072 : index %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index + %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c131072 step %6 { + %7 = arith.remsi %arg2, %c256 : index + %8 = arith.divsi %arg2, %c256 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<512x256x1x1xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<512x256x1x1xf32> } gpu.return } - gpu.func @Unknown160(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown165(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c512 = arith.constant 512 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 
: index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c2359296 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x512x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x512x3x3xf32> } gpu.return } - gpu.func @Unknown159(%arg0: memref<512x256x1x1xf16>, %arg1: memref<512x256x1x1xf32>) kernel { - %c0 = arith.constant 0 : index - %c131072 = arith.constant 131072 : index + gpu.func @Unknown164(%arg0: memref<512x256x3x3xf16>, %arg1: memref<512x256x3x3xf32>) kernel { + %c1179648 = arith.constant 1179648 : index %c256 = arith.constant 256 : index - %c-1 = arith.constant -1 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c131072 : index - scf.if %5 { - %6 = arith.remsi %4, %c256 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c256 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c256 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<512x256x1x1xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<512x256x1x1xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1179648 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x256x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x256x3x3xf32> } gpu.return } - gpu.func @Unknown158(%arg0: memref<512x512x3x3xf16>, %arg1: memref<512x512x3x3xf32>) kernel { + gpu.func @Unknown161(%arg0: memref<256x128x1x1xf16>, %arg1: memref<256x128x1x1xf32>) kernel { + %c32768 = arith.constant 32768 : index %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index + %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - 
%5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c32768 step %6 { + %7 = arith.remsi %arg2, %c128 : index + %8 = arith.divsi %arg2, %c128 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<256x128x1x1xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<256x128x1x1xf32> } gpu.return } - gpu.func @Unknown157(%arg0: memref<512x256x3x3xf16>, %arg1: memref<512x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c1179648 = arith.constant 1179648 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown160(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { + %c589824 = arith.constant 589824 : index %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1179648 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = 
arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x256x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c589824 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x256x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x256x3x3xf32> } gpu.return } - gpu.func @Unknown156(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index + gpu.func @Unknown159(%arg0: memref<256x128x3x3xf16>, %arg1: memref<256x128x3x3xf32>) kernel { + %c294912 = arith.constant 294912 : index + %c128 = arith.constant 128 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c294912 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x128x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x128x3x3xf32> } gpu.return } - gpu.func @Unknown155(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { + gpu.func @Unknown156(%arg0: memref<128x64x1x1xf16>, %arg1: memref<128x64x1x1xf32>) kernel { + %c8192 = arith.constant 8192 : index %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = 
arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index + %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c8192 step %6 { + %7 = arith.remsi %arg2, %c64 : index + %8 = arith.divsi %arg2, %c64 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<128x64x1x1xf16> + %10 = arith.extf %9 : f16 to f32 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<128x64x1x1xf32> } gpu.return } - gpu.func @Unknown154(%arg0: memref<256x128x1x1xf16>, %arg1: memref<256x128x1x1xf32>) kernel { - %c0 = arith.constant 0 : index - %c32768 = arith.constant 32768 : index + gpu.func @Unknown155(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { + %c147456 = arith.constant 147456 : index %c128 = arith.constant 128 : index - %c-1 = arith.constant -1 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c32768 : index - scf.if %5 { - %6 = arith.remsi %4, %c128 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c128 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c128 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<256x128x1x1xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<256x128x1x1xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c147456 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, 
%7] : memref<128x128x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x128x3x3xf32> } gpu.return } - gpu.func @Unknown153(%arg0: memref<256x256x3x3xf16>, %arg1: memref<256x256x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index + gpu.func @Unknown154(%arg0: memref<128x64x3x3xf16>, %arg1: memref<128x64x3x3xf32>) kernel { + %c73728 = arith.constant 73728 : index + %c64 = arith.constant 64 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c73728 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x64x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x64x3x3xf32> } gpu.return } - gpu.func @Unknown152(%arg0: memref<256x128x3x3xf16>, %arg1: memref<256x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c294912 = arith.constant 294912 : index + gpu.func @Unknown150(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { + %c36864 = arith.constant 36864 : index + %c64 = arith.constant 64 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c294912 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi 
%12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x128x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c36864 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x64x3x3xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x64x3x3xf32> } gpu.return } - gpu.func @Unknown151(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index + gpu.func @Unknown149(%arg0: memref<64x3x7x7xf16>, %arg1: memref<64x3x7x7xf32>) kernel { + %c9408 = arith.constant 9408 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf32> + 
%5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c9408 step %6 { + %7 = arith.remsi %arg2, %c7 : index + %8 = arith.divsi %arg2, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c3 : index + %12 = arith.divsi %10, %c3 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x3x7x7xf16> + %14 = arith.extf %13 : f16 to f32 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x3x7x7xf32> } gpu.return } - gpu.func @Unknown150(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + gpu.func @Unknown148(%arg0: memref, %arg1: memref) kernel { + %c1 = arith.constant 1 : index + %cst = arith.constant 4.000000e+00 : f32 %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1 step %6 { + %7 = memref.load %arg0[] : memref + %8 = arith.negf %7 : f32 + %9 = arith.divf %8, %cst : f32 + memref.store %9, %arg1[] : memref } gpu.return } - gpu.func @Unknown149(%arg0: memref<128x64x1x1xf16>, %arg1: memref<128x64x1x1xf32>) kernel { - %c0 = arith.constant 0 : index - %c8192 = arith.constant 8192 : index + gpu.func @Unknown144(%arg0: memref<4x64x112x112xi1>, %arg1: memref<4x64x112x112xf16>, %arg2: memref<4x64x112x112xf16>) kernel { + %c3211264 = arith.constant 3211264 : index + %cst = arith.constant 0.000000e+00 : f16 %c64 = arith.constant 64 : index - %c-1 = arith.constant -1 : index + %c112 = arith.constant 112 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c8192 : index - scf.if %5 { - %6 = arith.remsi %4, %c64 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c64 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - 
%11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c64 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<128x64x1x1xf16> - %17 = arith.extf %16 : f16 to f32 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<128x64x1x1xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c3211264 step %6 { + %7 = arith.remsi %arg3, %c112 : index + %8 = arith.divsi %arg3, %c112 : index + %9 = arith.remsi %8, %c112 : index + %10 = arith.divsi %8, %c112 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x112x112xi1> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x64x112x112xf16> + %15 = arith.select %13, %14, %cst : f16 + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x64x112x112xf16> } gpu.return } - gpu.func @Unknown148(%arg0: memref<128x128x3x3xf16>, %arg1: memref<128x128x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + gpu.func @Unknown143(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>) kernel { + %c802816 = arith.constant 802816 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg3, %c56 : index + %8 = arith.divsi %arg3, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %15 = arith.addf %13, %14 : f16 + memref.store %15, %arg2[%12, %11, 
%9, %7] : memref<4x64x56x56xf16> } gpu.return } - gpu.func @Unknown147(%arg0: memref<128x64x3x3xf16>, %arg1: memref<128x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c73728 = arith.constant 73728 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown131(%arg0: memref<4x64x56x56xi1>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>) kernel { + %c802816 = arith.constant 802816 : index + %cst = arith.constant 0.000000e+00 : f16 %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c73728 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg3, %c56 : index + %8 = arith.divsi %arg3, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x56x56xi1> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %15 = arith.select %13, %14, %cst : f16 + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x64x56x56xf16> } gpu.return } - gpu.func @Unknown146(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index + gpu.func @Unknown127(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>, %arg3: memref<4x64x56x56xf16>) kernel { + %c802816 = arith.constant 802816 : index + %cst = arith.constant 0.000000e+00 : f16 %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - 
%10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg4, %c56 : index + %8 = arith.divsi %arg4, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %15 = memref.load %arg2[%12, %11, %9, %7] : memref<4x64x56x56xi1> + %16 = arith.addf %13, %14 : f16 + %17 = arith.select %15, %16, %cst : f16 + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x64x56x56xf16> } gpu.return } - gpu.func @Unknown145(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown112(%arg0: memref<4x128x28x28xi1>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xf16>) kernel { + %c401408 = arith.constant 401408 : index + %cst = arith.constant 0.000000e+00 : f16 + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 
: index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c401408 step %6 { + %7 = arith.remsi %arg3, %c28 : index + %8 = arith.divsi %arg3, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x128x28x28xi1> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x128x28x28xf16> + %15 = arith.select %13, %14, %cst : f16 + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x128x28x28xf16> } gpu.return } - gpu.func @Unknown144(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown108(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>, %arg3: memref<4x128x28x28xf16>) kernel { + %c401408 = arith.constant 401408 : index + %cst = arith.constant 0.000000e+00 : f16 + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c401408 step %6 { + %7 = arith.remsi %arg4, %c28 : index + %8 = arith.divsi %arg4, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x128x28x28xf16> + %14 = memref.load 
%arg1[%12, %11, %9, %7] : memref<4x128x28x28xf16> + %15 = memref.load %arg2[%12, %11, %9, %7] : memref<4x128x28x28xi1> + %16 = arith.addf %13, %14 : f16 + %17 = arith.select %15, %16, %cst : f16 + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x128x28x28xf16> } gpu.return } - gpu.func @Unknown143(%arg0: memref<64x64x3x3xf16>, %arg1: memref<64x64x3x3xf32>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown93(%arg0: memref<4x256x14x14xi1>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xf16>) kernel { + %c200704 = arith.constant 200704 : index + %cst = arith.constant 0.000000e+00 : f16 + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg3, %c14 : index + %8 = arith.divsi %arg3, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x256x14x14xi1> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x256x14x14xf16> + %15 = arith.select %13, %14, %cst : f16 + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x256x14x14xf16> } gpu.return } - gpu.func @Unknown142(%arg0: memref<64x3x7x7xf16>, %arg1: memref<64x3x7x7xf32>) kernel { - %c0 = arith.constant 0 : index - %c9408 = arith.constant 9408 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c3 = arith.constant 3 : index + gpu.func @Unknown89(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>, %arg3: memref<4x256x14x14xf16>) kernel { + %c200704 = arith.constant 200704 : index + %cst = arith.constant 0.000000e+00 : f16 + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %0 = gpu.block_id x 
%1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c9408 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c3 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c3 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c3 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x3x7x7xf16> - %37 = arith.extf %36 : f16 to f32 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x3x7x7xf32> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg4, %c14 : index + %8 = arith.divsi %arg4, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x256x14x14xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x256x14x14xf16> + %15 = memref.load %arg2[%12, %11, %9, %7] : memref<4x256x14x14xi1> + %16 = arith.addf %13, %14 : f16 + %17 = arith.select %15, %16, %cst : f16 + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x256x14x14xf16> } gpu.return } - gpu.func @Unknown141(%arg0: memref, %arg1: memref) kernel { - %cst = arith.constant 4.000000e+00 : f32 - %c1 = arith.constant 1 : index + gpu.func @Unknown78(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>, %arg3: memref<4x512x7x7xf16>) kernel { + %c100352 = arith.constant 100352 : index + %cst = arith.constant 0.000000e+00 : f16 + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1 : index - scf.if %5 { - %6 = memref.load %arg0[] : memref - %7 = arith.negf %6 : f32 - %8 = arith.divf %7, %cst : f32 - memref.store %8, %arg1[] : memref + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg4, %c7 : index + %8 = arith.divsi %arg4, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x512x7x7xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x512x7x7xf16> + %15 = memref.load %arg2[%12, %11, %9, %7] : memref<4x512x7x7xi1> + %16 = arith.addf %13, %14 : f16 + %17 = arith.select %15, %16, %cst : 
f16 + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x512x7x7xf16> } gpu.return } - gpu.func @Unknown138(%arg0: memref<4x64x112x112xi1>, %arg1: memref<4x64x112x112xf16>, %arg2: memref<4x64x112x112xf16>) kernel { + gpu.func @Unknown74(%arg0: memref<4x512x7x7xi1>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xf16>) kernel { + %c100352 = arith.constant 100352 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c3211264 = arith.constant 3211264 : index - %c112 = arith.constant 112 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c3211264 : index - scf.if %5 { - %6 = arith.remsi %4, %c112 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c112 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c112 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c112 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c112 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c112 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x112x112xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x112x112xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x112x112xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg3, %c7 : index + %8 = arith.divsi %arg3, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x512x7x7xi1> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x512x7x7xf16> + %15 = arith.select %13, %14, %cst : f16 + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x512x7x7xf16> } gpu.return } - gpu.func @Unknown137(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>) kernel { - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown70(%arg0: memref<4x512xf16>, %arg1: memref<4x512x7x7xi1>, %arg2: memref<4x512x7x7xf16>) kernel { + %c100352 = arith.constant 100352 : index + %cst = arith.constant 4.900000e+01 : f16 + %cst_0 = arith.constant 0.000000e+00 : f16 + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = 
gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = arith.addf %36, %37 : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg3, %c7 : index + %8 = arith.divsi %arg3, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11] : memref<4x512xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x512x7x7xi1> + %15 = arith.divf %13, %cst : f16 + %16 = arith.select %14, %15, %cst_0 : f16 + memref.store %16, %arg2[%12, %11, %9, %7] : memref<4x512x7x7xf16> } gpu.return } - gpu.func @Unknown133(%arg0: memref<4x64x56x56xi1>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown69(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4xf16>, %arg3: memref<4x1000xf16>, %arg4: memref<4x1000xf16>, %arg5: memref<4x1000xf16>) kernel { + %c4000 = arith.constant 4000 : index + %c1000 = arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = 
arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg6 = %4 to %c4000 step %6 { + %7 = arith.remsi %arg6, %c1000 : index + %8 = arith.divsi %arg6, %c1000 : index + %9 = memref.load %arg2[%8] : memref<4xf16> + %10 = memref.load %arg0[%8] : memref<4xf16> + %11 = memref.load %arg1[%8, %7] : memref<4x1000xf16> + %12 = memref.load %arg3[%8, %7] : memref<4x1000xf16> + %13 = arith.subf %11, %10 : f16 + %14 = math.exp %13 : f16 + %15 = arith.mulf %14, %9 : f16 + %16 = arith.subf %12, %15 : f16 + memref.store %13, %arg4[%8, %7] : memref<4x1000xf16> + memref.store %16, %arg5[%8, %7] : memref<4x1000xf16> } gpu.return } - gpu.func @Unknown129(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>, %arg3: memref<4x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown68(%arg0: memref<4xf16>, %arg1: memref<4xf16>) kernel { + %c4 = arith.constant 4 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x64x56x56xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %39 = arith.addf %37, 
%38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c4 step %6 { + %7 = memref.load %arg0[%arg2] : memref<4xf16> + %8 = math.log %7 : f16 + memref.store %8, %arg1[%arg2] : memref<4xf16> } gpu.return } - gpu.func @Unknown125(%arg0: memref<4x64x56x56xi1>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown66(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4x1000xf16>) kernel { + %c4000 = arith.constant 4000 : index + %c1000 = arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c4000 step %6 { + %7 = arith.remsi %arg3, %c1000 : index + %8 = arith.divsi %arg3, %c1000 : index + %9 = memref.load %arg0[%8] : memref<4xf16> + %10 = memref.load %arg1[%8, %7] : memref<4x1000xf16> + %11 = arith.subf %10, %9 : f16 + memref.store %11, %arg2[%8, %7] : memref<4x1000xf16> } gpu.return } - gpu.func @Unknown121(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>, %arg3: memref<4x64x56x56xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown64(%arg0: memref<1000xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4x1000xf16>) kernel { + %c4000 = arith.constant 4000 : index + %c1000 = arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi 
%2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x64x56x56xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x64x56x56xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c4000 step %6 { + %7 = arith.remsi %arg3, %c1000 : index + %8 = arith.divsi %arg3, %c1000 : index + %9 = memref.load %arg0[%7] : memref<1000xf16> + %10 = memref.load %arg1[%8, %7] : memref<4x1000xf16> + %11 = arith.addf %10, %9 : f16 + memref.store %11, %arg2[%8, %7] : memref<4x1000xf16> } gpu.return } - gpu.func @Unknown114(%arg0: memref<4x128x28x28xi1>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + gpu.func @Unknown63(%arg0: memref<4x512xf16>, %arg1: memref<4x512xf16>) kernel { + %c2048 = arith.constant 2048 : index + %cst = arith.constant 2.040100e-02 : f16 + %c512 = arith.constant 512 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - 
%27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c2048 step %6 { + %7 = arith.remsi %arg2, %c512 : index + %8 = arith.divsi %arg2, %c512 : index + %9 = memref.load %arg0[%8, %7] : memref<4x512xf16> + %10 = arith.mulf %9, %cst : f16 + memref.store %10, %arg1[%8, %7] : memref<4x512xf16> } gpu.return } - gpu.func @Unknown110(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>, %arg3: memref<4x128x28x28xf16>) kernel { + gpu.func @Unknown57(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xf16>, %arg3: memref<4x512x7x7xi1>) kernel { + %c100352 = arith.constant 100352 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x128x28x28xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg4, %c7 : index + %8 = arith.divsi %arg4, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 
: index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x512x7x7xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x512x7x7xf16> + %15 = arith.addf %13, %14 : f16 + %16 = arith.maximumf %15, %cst : f16 + %17 = arith.cmpf ogt, %16, %cst : f16 + memref.store %16, %arg2[%12, %11, %9, %7] : memref<4x512x7x7xf16> + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x512x7x7xi1> } gpu.return } - gpu.func @Unknown106(%arg0: memref<4x128x28x28xi1>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xf16>) kernel { + gpu.func @Unknown55(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>) kernel { + %c100352 = arith.constant 100352 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + %c512 = arith.constant 512 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c100352 step %6 { + %7 = arith.remsi %arg3, %c7 : index + %8 = arith.divsi %arg3, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x512x7x7xf16> + %14 = arith.maximumf %13, %cst : f16 + %15 = arith.cmpf ogt, %14, %cst : f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<4x512x7x7xf16> + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x512x7x7xi1> } gpu.return } - gpu.func @Unknown102(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>, %arg3: memref<4x128x28x28xf16>) kernel { + gpu.func @Unknown48(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, 
%arg2: memref<4x256x14x14xf16>, %arg3: memref<4x256x14x14xi1>) kernel { + %c200704 = arith.constant 200704 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index + %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x128x28x28xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x128x28x28xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg4, %c14 : index + %8 = arith.divsi %arg4, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x256x14x14xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x256x14x14xf16> + %15 = arith.addf %13, %14 : f16 + %16 = arith.maximumf %15, %cst : f16 + %17 = arith.cmpf ogt, %16, %cst : f16 + memref.store %16, %arg2[%12, %11, %9, %7] : memref<4x256x14x14xf16> + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x256x14x14xi1> } gpu.return } - gpu.func @Unknown95(%arg0: memref<4x256x14x14xi1>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index + gpu.func @Unknown46(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>) kernel { %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index + %cst = arith.constant 0.000000e+00 : f16 %c256 = arith.constant 256 : index + %c14 = arith.constant 14 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = 
arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c200704 step %6 { + %7 = arith.remsi %arg3, %c14 : index + %8 = arith.divsi %arg3, %c14 : index + %9 = arith.remsi %8, %c14 : index + %10 = arith.divsi %8, %c14 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x256x14x14xf16> + %14 = arith.maximumf %13, %cst : f16 + %15 = arith.cmpf ogt, %14, %cst : f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<4x256x14x14xf16> + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x256x14x14xi1> } gpu.return } - gpu.func @Unknown91(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>, %arg3: memref<4x256x14x14xf16>) kernel { + gpu.func @Unknown39(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xf16>, %arg3: memref<4x128x28x28xi1>) kernel { + %c401408 = arith.constant 401408 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = 
arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x256x14x14xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c401408 step %6 { + %7 = arith.remsi %arg4, %c28 : index + %8 = arith.divsi %arg4, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x128x28x28xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x128x28x28xf16> + %15 = arith.addf %13, %14 : f16 + %16 = arith.maximumf %15, %cst : f16 + %17 = arith.cmpf ogt, %16, %cst : f16 + memref.store %16, %arg2[%12, %11, %9, %7] : memref<4x128x28x28xf16> + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x128x28x28xi1> } gpu.return } - gpu.func @Unknown87(%arg0: memref<4x256x14x14xi1>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xf16>) kernel { + gpu.func @Unknown37(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>) kernel { + %c401408 = arith.constant 401408 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index + %c128 = arith.constant 128 : index + %c28 = arith.constant 28 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : 
index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c401408 step %6 { + %7 = arith.remsi %arg3, %c28 : index + %8 = arith.divsi %arg3, %c28 : index + %9 = arith.remsi %8, %c28 : index + %10 = arith.divsi %8, %c28 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x128x28x28xf16> + %14 = arith.maximumf %13, %cst : f16 + %15 = arith.cmpf ogt, %14, %cst : f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<4x128x28x28xf16> + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x128x28x28xi1> } gpu.return } - gpu.func @Unknown83(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>, %arg3: memref<4x256x14x14xf16>) kernel { + gpu.func @Unknown30(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>, %arg3: memref<4x64x56x56xi1>) kernel { + %c802816 = arith.constant 802816 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index + %c64 = arith.constant 64 : index + %c56 = arith.constant 56 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x256x14x14xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x256x14x14xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg4 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg4, %c56 : index + %8 = arith.divsi %arg4, %c56 : index + %9 = arith.remsi %8, %c56 : 
index + %10 = arith.divsi %8, %c56 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %14 = memref.load %arg1[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %15 = arith.addf %13, %14 : f16 + %16 = arith.maximumf %15, %cst : f16 + %17 = arith.cmpf ogt, %16, %cst : f16 + memref.store %16, %arg2[%12, %11, %9, %7] : memref<4x64x56x56xf16> + memref.store %17, %arg3[%12, %11, %9, %7] : memref<4x64x56x56xi1> } gpu.return } - gpu.func @Unknown76(%arg0: memref<4x512x7x7xi1>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xf16>) kernel { + gpu.func @Unknown28(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>) kernel { + %c802816 = arith.constant 802816 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xf16> - } - gpu.return - } - gpu.func @Unknown72(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>, %arg3: memref<4x512x7x7xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = 
arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg2[%35, %29, %19, %9] : memref<4x512x7x7xi1> - %37 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %38 = memref.load %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %39 = arith.addf %37, %38 : f16 - %40 = arith.select %36, %39, %cst : f16 - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x512x7x7xf16> - } - gpu.return - } - gpu.func @Unknown68(%arg0: memref<4x512x7x7xi1>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xi1> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %38 = arith.select %36, %37, %cst : f16 - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xf16> - } - gpu.return - } - gpu.func @Unknown64(%arg0: memref<4x512xf16>, %arg1: memref<4x512x7x7xi1>, %arg2: memref<4x512x7x7xf16>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %cst_0 = arith.constant 4.900000e+01 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = 
arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg1[%35, %29, %19, %9] : memref<4x512x7x7xi1> - %37 = memref.load %arg0[%35, %29] : memref<4x512xf16> - %38 = arith.divf %37, %cst_0 : f16 - %39 = arith.select %36, %38, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xf16> - } - gpu.return - } - gpu.func @Unknown63(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4xf16>, %arg3: memref<4x1000xf16>, %arg4: memref<4x1000xf32>, %arg5: memref<4x1000xf16>, %arg6: memref<4x1000xf32>, %arg7: memref<4x1000xf32>) kernel { - %c0 = arith.constant 0 : index - %c4000 = arith.constant 4000 : index - %c1000 = arith.constant 1000 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c4000 : index - scf.if %5 { - %6 = arith.remsi %4, %c1000 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c1000 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c1000 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg3[%15, %9] : memref<4x1000xf16> - %17 = memref.load %arg1[%15, %9] : memref<4x1000xf16> - %18 = memref.load %arg0[%15] : memref<4xf16> - %19 = memref.load %arg2[%15] : memref<4xf16> - %20 = memref.load %arg4[%15, %9] : memref<4x1000xf32> - %21 = math.log %18 : f16 - %22 = arith.subf %17, %21 : f16 - %23 = math.exp %22 : f16 - %24 = arith.mulf %23, %19 : f16 - %25 = arith.subf %16, %24 : f16 - %26 = arith.extf %22 : f16 to f32 - %27 = arith.mulf %26, %20 : f32 - %28 = arith.extf %25 : f16 to f32 - memref.store %25, %arg5[%15, %9] : memref<4x1000xf16> - memref.store %27, %arg6[%15, %9] : memref<4x1000xf32> - memref.store %28, %arg7[%15, %9] : memref<4x1000xf32> - } - gpu.return - } - gpu.func @Unknown62(%arg0: memref<4xf16>, %arg1: memref<4x1000xf16>, %arg2: memref<4x1000xf16>, %arg3: memref<4x1000xf16>) kernel { - %c0 = 
arith.constant 0 : index - %c4000 = arith.constant 4000 : index - %c1000 = arith.constant 1000 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c4000 : index - scf.if %5 { - %6 = arith.remsi %4, %c1000 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c1000 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c1000 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg1[%15, %9] : memref<4x1000xf16> - %17 = memref.load %arg0[%15] : memref<4xf16> - %18 = arith.subf %16, %17 : f16 - %19 = math.exp %18 : f16 - memref.store %18, %arg2[%15, %9] : memref<4x1000xf16> - memref.store %19, %arg3[%15, %9] : memref<4x1000xf16> - } - gpu.return - } - gpu.func @Unknown61(%arg0: memref<1000xf32>, %arg1: memref<4x1000xf16>, %arg2: memref<4x1000xf16>) kernel { - %c0 = arith.constant 0 : index - %c4000 = arith.constant 4000 : index - %c1000 = arith.constant 1000 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c4000 : index - scf.if %5 { - %6 = arith.remsi %4, %c1000 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c1000 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c1000 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg1[%15, %9] : memref<4x1000xf16> - %17 = memref.load %arg0[%9] : memref<1000xf32> - %18 = arith.truncf %17 : f32 to f16 - %19 = arith.addf %16, %18 : f16 - memref.store %19, %arg2[%15, %9] : memref<4x1000xf16> - } - gpu.return - } - gpu.func @Unknown60(%arg0: memref<4x512xf16>, %arg1: memref<4x512xf16>) kernel { - %cst = arith.constant 2.040100e-02 : f16 - %c0 = arith.constant 0 : index - %c2048 = arith.constant 2048 : index - %c512 = arith.constant 512 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2048 : index - scf.if %5 { - %6 = arith.remsi %4, %c512 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c512 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c512 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9] : memref<4x512xf16> - %17 = arith.mulf %16, %cst : f16 - memref.store %17, %arg1[%15, %9] : memref<4x512xf16> - } - gpu.return - } - gpu.func @Unknown59(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xf16>, %arg3: memref<4x512x7x7xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = 
gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x512x7x7xi1> - } - gpu.return - } - gpu.func @Unknown57(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = 
arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xi1> - } - gpu.return - } - gpu.func @Unknown55(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xf16>, %arg3: memref<4x512x7x7xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x512x7x7xi1> - } - gpu.return - } - gpu.func @Unknown53(%arg0: memref<4x512x7x7xf16>, %arg1: memref<4x512x7x7xf16>, %arg2: memref<4x512x7x7xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c100352 = arith.constant 100352 : index - %c7 = arith.constant 7 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c100352 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = 
arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x512x7x7xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x512x7x7xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x512x7x7xi1> - } - gpu.return - } - gpu.func @Unknown50(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xf16>, %arg3: memref<4x256x14x14xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x256x14x14xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x256x14x14xi1> - } - gpu.return - } - gpu.func @Unknown48(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi 
%4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x256x14x14xi1> - } - gpu.return - } - gpu.func @Unknown46(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xf16>, %arg3: memref<4x256x14x14xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : 
memref<4x256x14x14xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x256x14x14xi1> - } - gpu.return - } - gpu.func @Unknown44(%arg0: memref<4x256x14x14xf16>, %arg1: memref<4x256x14x14xf16>, %arg2: memref<4x256x14x14xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c200704 = arith.constant 200704 : index - %c14 = arith.constant 14 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c200704 : index - scf.if %5 { - %6 = arith.remsi %4, %c14 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c14 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c14 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c14 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c14 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c14 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x256x14x14xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x256x14x14xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x256x14x14xi1> - } - gpu.return - } - gpu.func @Unknown41(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xf16>, %arg3: memref<4x128x28x28xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - 
%27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x128x28x28xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x128x28x28xi1> - } - gpu.return - } - gpu.func @Unknown39(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x128x28x28xi1> - } - gpu.return - } - gpu.func @Unknown37(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xf16>, %arg3: memref<4x128x28x28xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = 
arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x128x28x28xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x128x28x28xi1> - } - gpu.return - } - gpu.func @Unknown35(%arg0: memref<4x128x28x28xf16>, %arg1: memref<4x128x28x28xf16>, %arg2: memref<4x128x28x28xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c401408 = arith.constant 401408 : index - %c28 = arith.constant 28 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c401408 : index - scf.if %5 { - %6 = arith.remsi %4, %c28 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c28 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c28 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c28 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c28 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c28 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x128x28x28xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x128x28x28xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x128x28x28xi1> - } - gpu.return - } - gpu.func @Unknown32(%arg0: memref<4x64x56x56xf16>, %arg1: 
memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>, %arg3: memref<4x64x56x56xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x64x56x56xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x64x56x56xi1> - } - gpu.return - } - gpu.func @Unknown30(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = 
arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x56x56xi1> - } - gpu.return - } - gpu.func @Unknown28(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xf16>, %arg3: memref<4x64x56x56xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %37 = memref.load %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %38 = arith.addf %36, %37 : f16 - %39 = arith.maxnumf %38, %cst : f16 - %40 = arith.cmpf ogt, %39, %cst : f16 - memref.store %39, %arg2[%35, %29, %19, %9] : memref<4x64x56x56xf16> - memref.store %40, %arg3[%35, %29, %19, %9] : memref<4x64x56x56xi1> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c802816 step %6 { + %7 = arith.remsi %arg3, %c56 : index + %8 = arith.divsi %arg3, %c56 : index + %9 = arith.remsi %8, %c56 : index + %10 = arith.divsi %8, %c56 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x56x56xf16> + %14 = arith.maximumf %13, %cst : f16 + %15 = arith.cmpf ogt, %14, %cst : f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<4x64x56x56xf16> + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x64x56x56xi1> } gpu.return } - gpu.func @Unknown26(%arg0: memref<4x64x56x56xf16>, %arg1: memref<4x64x56x56xf16>, %arg2: memref<4x64x56x56xi1>) kernel { + gpu.func @Unknown26(%arg0: memref<4x64x112x112xf16>, %arg1: memref<4x64x112x112xf16>, %arg2: 
memref<4x64x112x112xi1>) kernel { + %c3211264 = arith.constant 3211264 : index %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c802816 = arith.constant 802816 : index - %c56 = arith.constant 56 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index + %c112 = arith.constant 112 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c802816 : index - scf.if %5 { - %6 = arith.remsi %4, %c56 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c56 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c56 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c56 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c56 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c56 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x56x56xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x64x56x56xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x56x56xi1> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg3 = %4 to %c3211264 step %6 { + %7 = arith.remsi %arg3, %c112 : index + %8 = arith.divsi %arg3, %c112 : index + %9 = arith.remsi %8, %c112 : index + %10 = arith.divsi %8, %c112 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x64x112x112xf16> + %14 = arith.maximumf %13, %cst : f16 + %15 = arith.cmpf ogt, %14, %cst : f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<4x64x112x112xf16> + memref.store %15, %arg2[%12, %11, %9, %7] : memref<4x64x112x112xi1> } gpu.return } - gpu.func @Unknown24(%arg0: memref<4x64x112x112xf16>, %arg1: memref<4x64x112x112xf16>, %arg2: memref<4x64x112x112xi1>) kernel { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c3211264 = arith.constant 3211264 : index - %c112 = arith.constant 112 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + gpu.func @Unknown24(%arg0: memref<1000xf32>, %arg1: memref<1000xf16>) kernel { + %c1000 = arith.constant 1000 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c3211264 : index - scf.if %5 { - %6 = arith.remsi %4, %c112 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c112 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - 
%13 = arith.divsi %12, %c112 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c112 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c112 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c112 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x64x112x112xf16> - %37 = arith.maxnumf %36, %cst : f16 - %38 = arith.cmpf ogt, %37, %cst : f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x64x112x112xf16> - memref.store %38, %arg2[%35, %29, %19, %9] : memref<4x64x112x112xi1> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1000 step %6 { + %7 = memref.load %arg0[%arg2] : memref<1000xf32> + %8 = arith.truncf %7 : f32 to f16 + memref.store %8, %arg1[%arg2] : memref<1000xf16> } gpu.return } gpu.func @Unknown23(%arg0: memref<1000x512xf32>, %arg1: memref<1000x512xf16>) kernel { - %c0 = arith.constant 0 : index %c512000 = arith.constant 512000 : index %c512 = arith.constant 512 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c512000 : index - scf.if %5 { - %6 = arith.remsi %4, %c512 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c512 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c512 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9] : memref<1000x512xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9] : memref<1000x512xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c512000 step %6 { + %7 = arith.remsi %arg2, %c512 : index + %8 = arith.divsi %arg2, %c512 : index + %9 = memref.load %arg0[%8, %7] : memref<1000x512xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7] : memref<1000x512xf16> } gpu.return } gpu.func @Unknown22(%arg0: memref<4x1000xf32>, %arg1: memref<4x1000xf16>) kernel { - %cst = arith.constant -2.500000e-01 : f32 - %c0 = arith.constant 0 : index %c4000 = arith.constant 4000 : index + %cst = arith.constant -2.500000e-01 : f32 %c1000 = arith.constant 1000 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c4000 : index - scf.if %5 { - %6 = arith.remsi %4, %c1000 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c1000 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, 
%c1000 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9] : memref<4x1000xf32> - %17 = arith.mulf %16, %cst : f32 - %18 = arith.truncf %17 : f32 to f16 - memref.store %18, %arg1[%15, %9] : memref<4x1000xf16> - } - gpu.return - } - gpu.func @Unknown21(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf16> - } - gpu.return - } - gpu.func @Unknown20(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c512 = arith.constant 512 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - 
%30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c4000 step %6 { + %7 = arith.remsi %arg2, %c1000 : index + %8 = arith.divsi %arg2, %c1000 : index + %9 = memref.load %arg0[%8, %7] : memref<4x1000xf32> + %10 = arith.mulf %9, %cst : f32 + %11 = arith.truncf %10 : f32 to f16 + memref.store %11, %arg1[%8, %7] : memref<4x1000xf16> } gpu.return } gpu.func @Unknown19(%arg0: memref<512x512x3x3xf32>, %arg1: memref<512x512x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c2359296 = arith.constant 2359296 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c512 = arith.constant 512 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c2359296 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c512 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c512 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c512 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x512x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x512x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c2359296 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c512 : index + %12 = arith.divsi %10, %c512 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x512x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x512x3x3xf16> } gpu.return } gpu.func @Unknown18(%arg0: memref<512x256x3x3xf32>, %arg1: memref<512x256x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c1179648 = arith.constant 1179648 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index 
%4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c1179648 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<512x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<512x256x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c1179648 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<512x256x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<512x256x3x3xf16> } gpu.return } gpu.func @Unknown17(%arg0: memref<512x256x1x1xf32>, %arg1: memref<512x256x1x1xf16>) kernel { - %c0 = arith.constant 0 : index %c131072 = arith.constant 131072 : index - %c256 = arith.constant 256 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c131072 : index - scf.if %5 { - %6 = arith.remsi %4, %c256 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c256 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c256 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<512x256x1x1xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<512x256x1x1xf16> - } - gpu.return - } - gpu.func @Unknown16(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c256 = arith.constant 256 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : 
index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf16> - } - gpu.return - } - gpu.func @Unknown15(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { %c0 = arith.constant 0 : index - %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c131072 step %6 { + %7 = arith.remsi %arg2, %c256 : index + %8 = arith.divsi %arg2, %c256 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<512x256x1x1xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<512x256x1x1xf16> } gpu.return } gpu.func @Unknown14(%arg0: memref<256x256x3x3xf32>, %arg1: memref<256x256x3x3xf16>) kernel { - 
%c0 = arith.constant 0 : index %c589824 = arith.constant 589824 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c256 = arith.constant 256 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c589824 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c256 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c256 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c256 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x256x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x256x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c589824 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c256 : index + %12 = arith.divsi %10, %c256 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x256x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x256x3x3xf16> } gpu.return } gpu.func @Unknown13(%arg0: memref<256x128x3x3xf32>, %arg1: memref<256x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c294912 = arith.constant 294912 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c294912 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, 
%c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<256x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<256x128x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c294912 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<256x128x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<256x128x3x3xf16> } gpu.return } gpu.func @Unknown12(%arg0: memref<256x128x1x1xf32>, %arg1: memref<256x128x1x1xf16>) kernel { - %c0 = arith.constant 0 : index %c32768 = arith.constant 32768 : index - %c128 = arith.constant 128 : index - %c-1 = arith.constant -1 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c32768 : index - scf.if %5 { - %6 = arith.remsi %4, %c128 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c128 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c128 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<256x128x1x1xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<256x128x1x1xf16> - } - gpu.return - } - gpu.func @Unknown11(%arg0: memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c128 = arith.constant 128 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi 
%c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf16> - } - gpu.return - } - gpu.func @Unknown10(%arg0: memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { %c0 = arith.constant 0 : index - %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c32768 step %6 { + %7 = arith.remsi %arg2, %c128 : index + %8 = arith.divsi %arg2, %c128 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<256x128x1x1xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<256x128x1x1xf16> } gpu.return } gpu.func @Unknown9(%arg0: memref<128x128x3x3xf32>, %arg1: memref<128x128x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c147456 = arith.constant 147456 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c128 = arith.constant 128 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c147456 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = 
arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c128 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c128 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c128 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x128x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x128x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c147456 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c128 : index + %12 = arith.divsi %10, %c128 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<128x128x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x128x3x3xf16> } gpu.return } gpu.func @Unknown8(%arg0: memref<128x64x3x3xf32>, %arg1: memref<128x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index %c73728 = arith.constant 73728 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c73728 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<128x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<128x64x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c73728 step %6 { + %7 = arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, 
%11, %9, %7] : memref<128x64x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<128x64x3x3xf16> } gpu.return } gpu.func @Unknown7(%arg0: memref<128x64x1x1xf32>, %arg1: memref<128x64x1x1xf16>) kernel { - %c0 = arith.constant 0 : index %c8192 = arith.constant 8192 : index + %c0 = arith.constant 0 : index %c64 = arith.constant 64 : index - %c-1 = arith.constant -1 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c8192 : index - scf.if %5 { - %6 = arith.remsi %4, %c64 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c64 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c64 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = memref.load %arg0[%15, %9, %c0, %c0] : memref<128x64x1x1xf32> - %17 = arith.truncf %16 : f32 to f16 - memref.store %17, %arg1[%15, %9, %c0, %c0] : memref<128x64x1x1xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c8192 step %6 { + %7 = arith.remsi %arg2, %c64 : index + %8 = arith.divsi %arg2, %c64 : index + %9 = memref.load %arg0[%8, %7, %c0, %c0] : memref<128x64x1x1xf32> + %10 = arith.truncf %9 : f32 to f16 + memref.store %10, %arg1[%8, %7, %c0, %c0] : memref<128x64x1x1xf16> } gpu.return } - gpu.func @Unknown6(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index + gpu.func @Unknown3(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index %c64 = arith.constant 64 : index + %c3 = arith.constant 3 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c36864 step %6 { + %7 = 
arith.remsi %arg2, %c3 : index + %8 = arith.divsi %arg2, %c3 : index + %9 = arith.remsi %8, %c3 : index + %10 = arith.divsi %8, %c3 : index + %11 = arith.remsi %10, %c64 : index + %12 = arith.divsi %10, %c64 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x64x3x3xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x64x3x3xf16> } gpu.return } - gpu.func @Unknown5(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index + gpu.func @Unknown1(%arg0: memref<64x3x7x7xf32>, %arg1: memref<64x3x7x7xf16>) kernel { + %c9408 = arith.constant 9408 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + %c7 = arith.constant 7 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c9408 step %6 { + %7 = arith.remsi %arg2, %c7 : index + %8 = arith.divsi %arg2, %c7 : index + %9 = arith.remsi %8, %c7 : index + %10 = arith.divsi %8, %c7 : index + %11 = arith.remsi %10, %c3 : index + %12 = arith.divsi %10, %c3 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<64x3x7x7xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<64x3x7x7xf16> } gpu.return } - gpu.func @Unknown4(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index + gpu.func @Unknown0(%arg0: memref<4x3x224x224xf32>, %arg1: memref<4x3x224x224xf16>) kernel { + %c602112 = arith.constant 602112 : index %c3 = arith.constant 3 : index - %c-1 = arith.constant -1 : index - %c64 = arith.constant 64 : index + %c224 = arith.constant 224 : index %0 = gpu.block_id x %1 = gpu.block_dim x %2 = gpu.thread_id x %3 = arith.muli %1, %0 : index %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : 
index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> + %5 = gpu.grid_dim x + %6 = arith.muli %1, %5 : index + scf.for %arg2 = %4 to %c602112 step %6 { + %7 = arith.remsi %arg2, %c224 : index + %8 = arith.divsi %arg2, %c224 : index + %9 = arith.remsi %8, %c224 : index + %10 = arith.divsi %8, %c224 : index + %11 = arith.remsi %10, %c3 : index + %12 = arith.divsi %10, %c3 : index + %13 = memref.load %arg0[%12, %11, %9, %7] : memref<4x3x224x224xf32> + %14 = arith.truncf %13 : f32 to f16 + memref.store %14, %arg1[%12, %11, %9, %7] : memref<4x3x224x224xf16> } gpu.return } - gpu.func @Unknown3(%arg0: memref<64x64x3x3xf32>, %arg1: memref<64x64x3x3xf16>) kernel { - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c3 = arith.constant 3 : index + gpu.func @Unknown25_kernel(%arg0: memref<4x1000xf16>, %arg1: memref<4xf16>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 0.000000e+00 : f16 + %c1000 = arith.constant 1000 : index + %c-1024 = arith.constant -1024 : index + %c512 = arith.constant 512 : index %c-1 = arith.constant -1 : index + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %0 = gpu.block_id x + %subview = memref.subview %arg0[%0, 0] [1, 1000] [1, 1] : memref<4x1000xf16> to memref<1000xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<1000xf16, strided<[1], offset: ?>> into memref<1x1000xf16, strided<[1000, 1], offset: ?>> + %alloca = memref.alloca() : memref<512xf16, #gpu.address_space> + %1 = gpu.thread_id x + %2 = arith.muli %1, %c2 : index + %3 = arith.cmpi slt, %1, %c0 : index + %4 = arith.subi %c-1, %1 : index + %5 = arith.select %3, %4, %1 : index + %6 = arith.divsi %5, %c512 : index + %7 = arith.subi %c-1, %6 : index + %8 = arith.select %3, %7, %6 : index + %9 = arith.muli %8, %c-1024 : index + %10 = arith.addi %2, %9 : index + %11 = arith.cmpi slt, %10, %c1000 : index + %12 = arith.select %11, %10, %c1000 : index + %13 = arith.addi %10, %c2 : index + %14 = 
arith.cmpi slt, %13, %c1000 : index + %15 = arith.select %14, %13, %c1000 : index + %16 = arith.subi %15, %12 : index + %subview_0 = memref.subview %expand_shape[0, %12] [1, %16] [1, 1] : memref<1x1000xf16, strided<[1000, 1], offset: ?>> to memref> + %expand_shape_1 = memref.expand_shape %subview_0 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %17 = arith.cmpi ugt, %16, %c0 : index + %18 = scf.if %17 -> (f16) { + %32 = memref.load %expand_shape_1[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %32 : f16 + } else { + scf.yield %cst : f16 + } + %19 = arith.addf %18, %cst : f16 + %20 = arith.cmpi ugt, %16, %c1 : index + %21 = scf.if %20 -> (f16) { + %32 = memref.load %expand_shape_1[%c0, %c1] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %32 : f16 + } else { + scf.yield %cst : f16 + } + %22 = arith.addf %19, %21 : f16 + memref.store %22, %alloca[%1] : memref<512xf16, #gpu.address_space> + gpu.barrier + %alloca_2 = memref.alloca() : memref<256xf16, #gpu.address_space> + %23 = arith.cmpi ult, %1, %c256 : index + scf.if %23 { + %32 = memref.load %alloca[%2] : memref<512xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca[%34] : memref<512xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %alloca_2[%1] : memref<256xf16, #gpu.address_space> + } + gpu.barrier + %alloca_3 = memref.alloca() : memref<128xf16, #gpu.address_space> + %24 = arith.cmpi ult, %1, %c128 : index + scf.if %24 { + %32 = memref.load %alloca_2[%2] : memref<256xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca_2[%34] : memref<256xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %alloca_3[%1] : memref<128xf16, #gpu.address_space> + } + gpu.barrier + %alloca_4 = memref.alloca() : memref<64xf16, #gpu.address_space> + %25 = arith.cmpi ult, %1, %c64 : index + scf.if %25 { + %32 = memref.load %alloca_3[%2] : memref<128xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca_3[%34] : memref<128xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %alloca_4[%1] : memref<64xf16, #gpu.address_space> + } + gpu.barrier + %alloca_5 = memref.alloca() : memref<32xf16, #gpu.address_space> + %26 = arith.cmpi ult, %1, %c32 : index + scf.if %26 { + %32 = memref.load %alloca_4[%2] : memref<64xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca_4[%34] : memref<64xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %alloca_5[%1] : memref<32xf16, #gpu.address_space> + } + gpu.barrier + %alloca_6 = memref.alloca() : memref<16xf16, #gpu.address_space> + %27 = arith.cmpi ult, %1, %c16 : index + scf.if %27 { + %32 = memref.load %alloca_5[%2] : memref<32xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca_5[%34] : memref<32xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %alloca_6[%1] : memref<16xf16, #gpu.address_space> + } + gpu.barrier + %alloca_7 = memref.alloca() : memref<8xf16, #gpu.address_space> + %28 = arith.cmpi ult, %1, %c8 : index + scf.if %28 { + %32 = memref.load %alloca_6[%2] : memref<16xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load 
%alloca_6[%34] : memref<16xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %alloca_7[%1] : memref<8xf16, #gpu.address_space> + } + gpu.barrier + %alloca_8 = memref.alloca() : memref<4xf16, #gpu.address_space> + %29 = arith.cmpi ult, %1, %c4 : index + scf.if %29 { + %32 = memref.load %alloca_7[%2] : memref<8xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca_7[%34] : memref<8xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %alloca_8[%1] : memref<4xf16, #gpu.address_space> + } + gpu.barrier + %alloca_9 = memref.alloca() : memref<2xf16, #gpu.address_space> + %30 = arith.cmpi ult, %1, %c2 : index + scf.if %30 { + %32 = memref.load %alloca_8[%2] : memref<4xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca_8[%34] : memref<4xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %alloca_9[%1] : memref<2xf16, #gpu.address_space> + } + gpu.barrier + %31 = arith.cmpi ult, %1, %c1 : index + scf.if %31 { + %32 = memref.load %alloca_9[%2] : memref<2xf16, #gpu.address_space> + %33 = arith.addf %32, %cst : f16 + %34 = arith.addi %2, %c1 : index + %35 = memref.load %alloca_9[%34] : memref<2xf16, #gpu.address_space> + %36 = arith.addf %35, %33 : f16 + memref.store %36, %arg1[%0] : memref<4xf16> + } + gpu.barrier + gpu.return + } + gpu.func @Unknown62_kernel(%arg0: memref<2048x49xf16>, %arg1: memref<2048xf16>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c2 = arith.constant 2 : index + %c32 = arith.constant 32 : index + %cst = arith.constant 0.000000e+00 : f16 + %c1 = arith.constant 1 : index + %c49 = arith.constant 49 : index + %c0 = arith.constant 0 : index %c64 = arith.constant 64 : index %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c36864 : index - scf.if %5 { - %6 = arith.remsi %4, %c3 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c3 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c3 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c3 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c3 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c3 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c64 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c64 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c64 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x64x3x3xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x64x3x3xf16> - } - gpu.return - } - gpu.func 
@Unknown1(%arg0: memref<64x3x7x7xf32>, %arg1: memref<64x3x7x7xf16>) kernel { - %c0 = arith.constant 0 : index - %c9408 = arith.constant 9408 : index - %c7 = arith.constant 7 : index + %subview = memref.subview %arg0[%0, 0] [1, 49] [1, 1] : memref<2048x49xf16> to memref<49xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<49xf16, strided<[1], offset: ?>> into memref<1x49xf16, strided<[49, 1], offset: ?>> + %alloca = memref.alloca() : memref<64xf16, #gpu.address_space> + %1 = gpu.thread_id x + %2 = arith.remsi %1, %c64 : index + %3 = arith.cmpi slt, %2, %c0 : index + %4 = arith.addi %2, %c64 : index + %5 = arith.select %3, %4, %2 : index + %6 = arith.cmpi slt, %5, %c49 : index + %7 = arith.select %6, %5, %c49 : index + %8 = arith.addi %5, %c1 : index + %9 = arith.cmpi slt, %8, %c49 : index + %10 = arith.select %9, %8, %c49 : index + %11 = arith.subi %10, %7 : index + %subview_0 = memref.subview %expand_shape[0, %7] [1, %11] [1, 1] : memref<1x49xf16, strided<[49, 1], offset: ?>> to memref> + %expand_shape_1 = memref.expand_shape %subview_0 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %12 = arith.cmpi ugt, %11, %c0 : index + %13 = scf.if %12 -> (f16) { + %21 = memref.load %expand_shape_1[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %21 : f16 + } else { + scf.yield %cst : f16 + } + %14 = arith.addf %13, %cst : f16 + memref.store %14, %alloca[%1] : memref<64xf16, #gpu.address_space> + gpu.barrier + %alloca_2 = memref.alloca() : memref<32xf16, #gpu.address_space> + %15 = arith.cmpi ult, %1, %c32 : index + scf.if %15 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca[%21] : memref<64xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca[%24] : memref<64xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_2[%1] : memref<32xf16, #gpu.address_space> + } + gpu.barrier + %alloca_3 = memref.alloca() : memref<16xf16, #gpu.address_space> + %16 = arith.cmpi ult, %1, %c16 : index + scf.if %16 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_2[%21] : memref<32xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_2[%24] : memref<32xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_3[%1] : memref<16xf16, #gpu.address_space> + } + gpu.barrier + %alloca_4 = memref.alloca() : memref<8xf16, #gpu.address_space> + %17 = arith.cmpi ult, %1, %c8 : index + scf.if %17 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_3[%21] : memref<16xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_3[%24] : memref<16xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_4[%1] : memref<8xf16, #gpu.address_space> + } + gpu.barrier + %alloca_5 = memref.alloca() : memref<4xf16, #gpu.address_space> + %18 = arith.cmpi ult, %1, %c4 : index + scf.if %18 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_4[%21] : memref<8xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_4[%24] : memref<8xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_5[%1] : memref<4xf16, #gpu.address_space> + } + gpu.barrier + %alloca_6 = memref.alloca() : memref<2xf16, #gpu.address_space> + %19 = 
arith.cmpi ult, %1, %c2 : index + scf.if %19 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_5[%21] : memref<4xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_5[%24] : memref<4xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %alloca_6[%1] : memref<2xf16, #gpu.address_space> + } + gpu.barrier + %20 = arith.cmpi ult, %1, %c1 : index + scf.if %20 { + %21 = arith.muli %1, %c2 : index + %22 = memref.load %alloca_6[%21] : memref<2xf16, #gpu.address_space> + %23 = arith.addf %22, %cst : f16 + %24 = arith.addi %21, %c1 : index + %25 = memref.load %alloca_6[%24] : memref<2xf16, #gpu.address_space> + %26 = arith.addf %25, %23 : f16 + memref.store %26, %arg1[%0] : memref<2048xf16> + } + gpu.barrier + gpu.return + } + gpu.func @Unknown65_kernel(%arg0: memref<4x1000xf16>, %arg1: memref<4xf16>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 0.000000e+00 : f16 + %c1000 = arith.constant 1000 : index + %c-1024 = arith.constant -1024 : index + %c512 = arith.constant 512 : index %c-1 = arith.constant -1 : index - %c3 = arith.constant 3 : index - %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c9408 : index - scf.if %5 { - %6 = arith.remsi %4, %c7 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c7 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c7 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c7 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c7 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c7 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c3 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c3 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c3 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<64x3x7x7xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<64x3x7x7xf16> - } - gpu.return - } - gpu.func @Unknown0(%arg0: memref<4x3x224x224xf32>, %arg1: memref<4x3x224x224xf16>) kernel { %c0 = arith.constant 0 : index - %c602112 = arith.constant 602112 : index - %c224 = arith.constant 224 : index + %c2 = arith.constant 2 : index + %0 = gpu.block_id x + %subview = memref.subview %arg0[%0, 0] [1, 1000] [1, 1] : memref<4x1000xf16> to memref<1000xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<1000xf16, strided<[1], offset: ?>> into 
memref<1x1000xf16, strided<[1000, 1], offset: ?>> + %alloca = memref.alloca() : memref<512xf16, #gpu.address_space> + %1 = gpu.thread_id x + %2 = arith.muli %1, %c2 : index + %3 = arith.cmpi slt, %1, %c0 : index + %4 = arith.subi %c-1, %1 : index + %5 = arith.select %3, %4, %1 : index + %6 = arith.divsi %5, %c512 : index + %7 = arith.subi %c-1, %6 : index + %8 = arith.select %3, %7, %6 : index + %9 = arith.muli %8, %c-1024 : index + %10 = arith.addi %2, %9 : index + %11 = arith.cmpi slt, %10, %c1000 : index + %12 = arith.select %11, %10, %c1000 : index + %13 = arith.addi %10, %c2 : index + %14 = arith.cmpi slt, %13, %c1000 : index + %15 = arith.select %14, %13, %c1000 : index + %16 = arith.subi %15, %12 : index + %subview_0 = memref.subview %expand_shape[0, %12] [1, %16] [1, 1] : memref<1x1000xf16, strided<[1000, 1], offset: ?>> to memref> + %expand_shape_1 = memref.expand_shape %subview_0 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %17 = arith.cmpi ugt, %16, %c0 : index + %18 = scf.if %17 -> (f16) { + %31 = memref.load %expand_shape_1[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %31 : f16 + } else { + scf.yield %cst : f16 + } + %19 = arith.cmpi ugt, %16, %c1 : index + %20 = scf.if %19 -> (f16) { + %31 = memref.load %expand_shape_1[%c0, %c1] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %31 : f16 + } else { + scf.yield %cst : f16 + } + %21 = arith.maximumf %18, %20 : f16 + memref.store %21, %alloca[%1] : memref<512xf16, #gpu.address_space> + gpu.barrier + %alloca_2 = memref.alloca() : memref<256xf16, #gpu.address_space> + %22 = arith.cmpi ult, %1, %c256 : index + scf.if %22 { + %31 = memref.load %alloca[%2] : memref<512xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca[%32] : memref<512xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %alloca_2[%1] : memref<256xf16, #gpu.address_space> + } + gpu.barrier + %alloca_3 = memref.alloca() : memref<128xf16, #gpu.address_space> + %23 = arith.cmpi ult, %1, %c128 : index + scf.if %23 { + %31 = memref.load %alloca_2[%2] : memref<256xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca_2[%32] : memref<256xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %alloca_3[%1] : memref<128xf16, #gpu.address_space> + } + gpu.barrier + %alloca_4 = memref.alloca() : memref<64xf16, #gpu.address_space> + %24 = arith.cmpi ult, %1, %c64 : index + scf.if %24 { + %31 = memref.load %alloca_3[%2] : memref<128xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca_3[%32] : memref<128xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %alloca_4[%1] : memref<64xf16, #gpu.address_space> + } + gpu.barrier + %alloca_5 = memref.alloca() : memref<32xf16, #gpu.address_space> + %25 = arith.cmpi ult, %1, %c32 : index + scf.if %25 { + %31 = memref.load %alloca_4[%2] : memref<64xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca_4[%32] : memref<64xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %alloca_5[%1] : memref<32xf16, #gpu.address_space> + } + gpu.barrier + %alloca_6 = memref.alloca() : memref<16xf16, #gpu.address_space> + %26 = arith.cmpi ult, %1, %c16 : index + scf.if %26 { + %31 = memref.load %alloca_5[%2] : memref<32xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca_5[%32] : memref<32xf16, 
#gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %alloca_6[%1] : memref<16xf16, #gpu.address_space> + } + gpu.barrier + %alloca_7 = memref.alloca() : memref<8xf16, #gpu.address_space> + %27 = arith.cmpi ult, %1, %c8 : index + scf.if %27 { + %31 = memref.load %alloca_6[%2] : memref<16xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca_6[%32] : memref<16xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %alloca_7[%1] : memref<8xf16, #gpu.address_space> + } + gpu.barrier + %alloca_8 = memref.alloca() : memref<4xf16, #gpu.address_space> + %28 = arith.cmpi ult, %1, %c4 : index + scf.if %28 { + %31 = memref.load %alloca_7[%2] : memref<8xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca_7[%32] : memref<8xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %alloca_8[%1] : memref<4xf16, #gpu.address_space> + } + gpu.barrier + %alloca_9 = memref.alloca() : memref<2xf16, #gpu.address_space> + %29 = arith.cmpi ult, %1, %c2 : index + scf.if %29 { + %31 = memref.load %alloca_8[%2] : memref<4xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca_8[%32] : memref<4xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %alloca_9[%1] : memref<2xf16, #gpu.address_space> + } + gpu.barrier + %30 = arith.cmpi ult, %1, %c1 : index + scf.if %30 { + %31 = memref.load %alloca_9[%2] : memref<2xf16, #gpu.address_space> + %32 = arith.addi %2, %c1 : index + %33 = memref.load %alloca_9[%32] : memref<2xf16, #gpu.address_space> + %34 = arith.maximumf %33, %31 : f16 + memref.store %34, %arg1[%0] : memref<4xf16> + } + gpu.barrier + gpu.return + } + gpu.func @Unknown67_kernel(%arg0: memref<4x1000xf16>, %arg1: memref<4xf16>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c256 = arith.constant 256 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 0.000000e+00 : f16 + %c1000 = arith.constant 1000 : index + %c-1024 = arith.constant -1024 : index + %c512 = arith.constant 512 : index %c-1 = arith.constant -1 : index - %c3 = arith.constant 3 : index + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %0 = gpu.block_id x + %subview = memref.subview %arg0[%0, 0] [1, 1000] [1, 1] : memref<4x1000xf16> to memref<1000xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<1000xf16, strided<[1], offset: ?>> into memref<1x1000xf16, strided<[1000, 1], offset: ?>> + %alloca = memref.alloca() : memref<512xf16, #gpu.address_space> + %1 = gpu.thread_id x + %2 = arith.muli %1, %c2 : index + %3 = arith.cmpi slt, %1, %c0 : index + %4 = arith.subi %c-1, %1 : index + %5 = arith.select %3, %4, %1 : index + %6 = arith.divsi %5, %c512 : index + %7 = arith.subi %c-1, %6 : index + %8 = arith.select %3, %7, %6 : index + %9 = arith.muli %8, %c-1024 : index + %10 = arith.addi %2, %9 : index + %11 = arith.cmpi slt, %10, %c1000 : index + %12 = arith.select %11, %10, %c1000 : index + %13 = arith.addi %10, %c2 : index + %14 = arith.cmpi slt, %13, %c1000 : index + %15 = arith.select %14, %13, %c1000 : index + %16 = arith.subi %15, %12 : index + %subview_0 = memref.subview %expand_shape[0, %12] [1, %16] [1, 1] : 
memref<1x1000xf16, strided<[1000, 1], offset: ?>> to memref> + %expand_shape_1 = memref.expand_shape %subview_0 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %17 = arith.cmpi ugt, %16, %c0 : index + %18 = scf.if %17 -> (f16) { + %34 = memref.load %expand_shape_1[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %34 : f16 + } else { + scf.yield %cst : f16 + } + %19 = math.exp %18 : f16 + %20 = arith.addf %19, %cst : f16 + %21 = arith.cmpi ugt, %16, %c1 : index + %22 = scf.if %21 -> (f16) { + %34 = memref.load %expand_shape_1[%c0, %c1] : memref<1x?xf16, strided<[?, 1], offset: ?>> + scf.yield %34 : f16 + } else { + scf.yield %cst : f16 + } + %23 = math.exp %22 : f16 + %24 = arith.addf %20, %23 : f16 + memref.store %24, %alloca[%1] : memref<512xf16, #gpu.address_space> + gpu.barrier + %alloca_2 = memref.alloca() : memref<256xf16, #gpu.address_space> + %25 = arith.cmpi ult, %1, %c256 : index + scf.if %25 { + %34 = memref.load %alloca[%2] : memref<512xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca[%36] : memref<512xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_2[%1] : memref<256xf16, #gpu.address_space> + } + gpu.barrier + %alloca_3 = memref.alloca() : memref<128xf16, #gpu.address_space> + %26 = arith.cmpi ult, %1, %c128 : index + scf.if %26 { + %34 = memref.load %alloca_2[%2] : memref<256xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca_2[%36] : memref<256xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_3[%1] : memref<128xf16, #gpu.address_space> + } + gpu.barrier + %alloca_4 = memref.alloca() : memref<64xf16, #gpu.address_space> + %27 = arith.cmpi ult, %1, %c64 : index + scf.if %27 { + %34 = memref.load %alloca_3[%2] : memref<128xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca_3[%36] : memref<128xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_4[%1] : memref<64xf16, #gpu.address_space> + } + gpu.barrier + %alloca_5 = memref.alloca() : memref<32xf16, #gpu.address_space> + %28 = arith.cmpi ult, %1, %c32 : index + scf.if %28 { + %34 = memref.load %alloca_4[%2] : memref<64xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca_4[%36] : memref<64xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_5[%1] : memref<32xf16, #gpu.address_space> + } + gpu.barrier + %alloca_6 = memref.alloca() : memref<16xf16, #gpu.address_space> + %29 = arith.cmpi ult, %1, %c16 : index + scf.if %29 { + %34 = memref.load %alloca_5[%2] : memref<32xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca_5[%36] : memref<32xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_6[%1] : memref<16xf16, #gpu.address_space> + } + gpu.barrier + %alloca_7 = memref.alloca() : memref<8xf16, #gpu.address_space> + %30 = arith.cmpi ult, %1, %c8 : index + scf.if %30 { + %34 = memref.load %alloca_6[%2] : memref<16xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca_6[%36] : memref<16xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_7[%1] : memref<8xf16, 
#gpu.address_space> + } + gpu.barrier + %alloca_8 = memref.alloca() : memref<4xf16, #gpu.address_space> + %31 = arith.cmpi ult, %1, %c4 : index + scf.if %31 { + %34 = memref.load %alloca_7[%2] : memref<8xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca_7[%36] : memref<8xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_8[%1] : memref<4xf16, #gpu.address_space> + } + gpu.barrier + %alloca_9 = memref.alloca() : memref<2xf16, #gpu.address_space> + %32 = arith.cmpi ult, %1, %c2 : index + scf.if %32 { + %34 = memref.load %alloca_8[%2] : memref<4xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca_8[%36] : memref<4xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %alloca_9[%1] : memref<2xf16, #gpu.address_space> + } + gpu.barrier + %33 = arith.cmpi ult, %1, %c1 : index + scf.if %33 { + %34 = memref.load %alloca_9[%2] : memref<2xf16, #gpu.address_space> + %35 = arith.addf %34, %cst : f16 + %36 = arith.addi %2, %c1 : index + %37 = memref.load %alloca_9[%36] : memref<2xf16, #gpu.address_space> + %38 = arith.addf %37, %35 : f16 + memref.store %38, %arg1[%0] : memref<4xf16> + } + gpu.barrier + gpu.return + } + gpu.func @Unknown147_kernel(%arg0: memref<32x125xf16>, %arg1: memref<32x125xf32>, %arg2: memref<32xf32>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c2 = arith.constant 2 : index + %c64 = arith.constant 64 : index + %cst = arith.constant 0.000000e+00 : f32 + %cst_0 = arith.constant 0.000000e+00 : f16 + %c1 = arith.constant 1 : index + %c125 = arith.constant 125 : index + %c0 = arith.constant 0 : index + %c128 = arith.constant 128 : index %0 = gpu.block_id x - %1 = gpu.block_dim x - %2 = gpu.thread_id x - %3 = arith.muli %1, %0 : index - %4 = arith.addi %2, %3 : index - %5 = arith.cmpi slt, %4, %c602112 : index - scf.if %5 { - %6 = arith.remsi %4, %c224 : index - %7 = arith.cmpi slt, %6, %c0 : index - %8 = arith.addi %6, %c224 : index - %9 = arith.select %7, %8, %6 : index - %10 = arith.cmpi slt, %4, %c0 : index - %11 = arith.subi %c-1, %4 : index - %12 = arith.select %10, %11, %4 : index - %13 = arith.divsi %12, %c224 : index - %14 = arith.subi %c-1, %13 : index - %15 = arith.select %10, %14, %13 : index - %16 = arith.remsi %15, %c224 : index - %17 = arith.cmpi slt, %16, %c0 : index - %18 = arith.addi %16, %c224 : index - %19 = arith.select %17, %18, %16 : index - %20 = arith.cmpi slt, %15, %c0 : index - %21 = arith.subi %c-1, %15 : index - %22 = arith.select %20, %21, %15 : index - %23 = arith.divsi %22, %c224 : index - %24 = arith.subi %c-1, %23 : index - %25 = arith.select %20, %24, %23 : index - %26 = arith.remsi %25, %c3 : index - %27 = arith.cmpi slt, %26, %c0 : index - %28 = arith.addi %26, %c3 : index - %29 = arith.select %27, %28, %26 : index - %30 = arith.cmpi slt, %25, %c0 : index - %31 = arith.subi %c-1, %25 : index - %32 = arith.select %30, %31, %25 : index - %33 = arith.divsi %32, %c3 : index - %34 = arith.subi %c-1, %33 : index - %35 = arith.select %30, %34, %33 : index - %36 = memref.load %arg0[%35, %29, %19, %9] : memref<4x3x224x224xf32> - %37 = arith.truncf %36 : f32 to f16 - memref.store %37, %arg1[%35, %29, %19, %9] : memref<4x3x224x224xf16> - } + %subview = memref.subview %arg0[%0, 0] [1, 125] 
[1, 1] : memref<32x125xf16> to memref<125xf16, strided<[1], offset: ?>> + %expand_shape = memref.expand_shape %subview [[0, 1]] : memref<125xf16, strided<[1], offset: ?>> into memref<1x125xf16, strided<[125, 1], offset: ?>> + %subview_1 = memref.subview %arg1[%0, 0] [1, 125] [1, 1] : memref<32x125xf32> to memref<125xf32, strided<[1], offset: ?>> + %expand_shape_2 = memref.expand_shape %subview_1 [[0, 1]] : memref<125xf32, strided<[1], offset: ?>> into memref<1x125xf32, strided<[125, 1], offset: ?>> + %alloca = memref.alloca() : memref<128xf32, #gpu.address_space> + %1 = gpu.thread_id x + %2 = arith.remsi %1, %c128 : index + %3 = arith.cmpi slt, %2, %c0 : index + %4 = arith.addi %2, %c128 : index + %5 = arith.select %3, %4, %2 : index + %6 = arith.cmpi slt, %5, %c125 : index + %7 = arith.select %6, %5, %c125 : index + %8 = arith.addi %5, %c1 : index + %9 = arith.cmpi slt, %8, %c125 : index + %10 = arith.select %9, %8, %c125 : index + %11 = arith.subi %10, %7 : index + %subview_3 = memref.subview %expand_shape[0, %7] [1, %11] [1, 1] : memref<1x125xf16, strided<[125, 1], offset: ?>> to memref> + %expand_shape_4 = memref.expand_shape %subview_3 [[0, 1]] : memref> into memref<1x?xf16, strided<[?, 1], offset: ?>> + %subview_5 = memref.subview %expand_shape_2[0, %7] [1, %11] [1, 1] : memref<1x125xf32, strided<[125, 1], offset: ?>> to memref> + %expand_shape_6 = memref.expand_shape %subview_5 [[0, 1]] : memref> into memref<1x?xf32, strided<[?, 1], offset: ?>> + %12 = arith.cmpi ugt, %11, %c0 : index + %13:2 = scf.if %12 -> (f16, f32) { + %24 = memref.load %expand_shape_4[%c0, %c0] : memref<1x?xf16, strided<[?, 1], offset: ?>> + %25 = memref.load %expand_shape_6[%c0, %c0] : memref<1x?xf32, strided<[?, 1], offset: ?>> + scf.yield %24, %25 : f16, f32 + } else { + scf.yield %cst_0, %cst : f16, f32 + } + %14 = arith.extf %13#0 : f16 to f32 + %15 = arith.mulf %14, %13#1 : f32 + %16 = arith.addf %15, %cst : f32 + memref.store %16, %alloca[%1] : memref<128xf32, #gpu.address_space> + gpu.barrier + %alloca_7 = memref.alloca() : memref<64xf32, #gpu.address_space> + %17 = arith.cmpi ult, %1, %c64 : index + scf.if %17 { + %24 = arith.muli %1, %c2 : index + %25 = memref.load %alloca[%24] : memref<128xf32, #gpu.address_space> + %26 = arith.addf %25, %cst : f32 + %27 = arith.addi %24, %c1 : index + %28 = memref.load %alloca[%27] : memref<128xf32, #gpu.address_space> + %29 = arith.addf %28, %26 : f32 + memref.store %29, %alloca_7[%1] : memref<64xf32, #gpu.address_space> + } + gpu.barrier + %alloca_8 = memref.alloca() : memref<32xf32, #gpu.address_space> + %18 = arith.cmpi ult, %1, %c32 : index + scf.if %18 { + %24 = arith.muli %1, %c2 : index + %25 = memref.load %alloca_7[%24] : memref<64xf32, #gpu.address_space> + %26 = arith.addf %25, %cst : f32 + %27 = arith.addi %24, %c1 : index + %28 = memref.load %alloca_7[%27] : memref<64xf32, #gpu.address_space> + %29 = arith.addf %28, %26 : f32 + memref.store %29, %alloca_8[%1] : memref<32xf32, #gpu.address_space> + } + gpu.barrier + %alloca_9 = memref.alloca() : memref<16xf32, #gpu.address_space> + %19 = arith.cmpi ult, %1, %c16 : index + scf.if %19 { + %24 = arith.muli %1, %c2 : index + %25 = memref.load %alloca_8[%24] : memref<32xf32, #gpu.address_space> + %26 = arith.addf %25, %cst : f32 + %27 = arith.addi %24, %c1 : index + %28 = memref.load %alloca_8[%27] : memref<32xf32, #gpu.address_space> + %29 = arith.addf %28, %26 : f32 + memref.store %29, %alloca_9[%1] : memref<16xf32, #gpu.address_space> + } + gpu.barrier + %alloca_10 = memref.alloca() : memref<8xf32, 
#gpu.address_space> + %20 = arith.cmpi ult, %1, %c8 : index + scf.if %20 { + %24 = arith.muli %1, %c2 : index + %25 = memref.load %alloca_9[%24] : memref<16xf32, #gpu.address_space> + %26 = arith.addf %25, %cst : f32 + %27 = arith.addi %24, %c1 : index + %28 = memref.load %alloca_9[%27] : memref<16xf32, #gpu.address_space> + %29 = arith.addf %28, %26 : f32 + memref.store %29, %alloca_10[%1] : memref<8xf32, #gpu.address_space> + } + gpu.barrier + %alloca_11 = memref.alloca() : memref<4xf32, #gpu.address_space> + %21 = arith.cmpi ult, %1, %c4 : index + scf.if %21 { + %24 = arith.muli %1, %c2 : index + %25 = memref.load %alloca_10[%24] : memref<8xf32, #gpu.address_space> + %26 = arith.addf %25, %cst : f32 + %27 = arith.addi %24, %c1 : index + %28 = memref.load %alloca_10[%27] : memref<8xf32, #gpu.address_space> + %29 = arith.addf %28, %26 : f32 + memref.store %29, %alloca_11[%1] : memref<4xf32, #gpu.address_space> + } + gpu.barrier + %alloca_12 = memref.alloca() : memref<2xf32, #gpu.address_space> + %22 = arith.cmpi ult, %1, %c2 : index + scf.if %22 { + %24 = arith.muli %1, %c2 : index + %25 = memref.load %alloca_11[%24] : memref<4xf32, #gpu.address_space> + %26 = arith.addf %25, %cst : f32 + %27 = arith.addi %24, %c1 : index + %28 = memref.load %alloca_11[%27] : memref<4xf32, #gpu.address_space> + %29 = arith.addf %28, %26 : f32 + memref.store %29, %alloca_12[%1] : memref<2xf32, #gpu.address_space> + } + gpu.barrier + %23 = arith.cmpi ult, %1, %c1 : index + scf.if %23 { + %24 = arith.muli %1, %c2 : index + %25 = memref.load %alloca_12[%24] : memref<2xf32, #gpu.address_space> + %26 = arith.addf %25, %cst : f32 + %27 = arith.addi %24, %c1 : index + %28 = memref.load %alloca_12[%27] : memref<2xf32, #gpu.address_space> + %29 = arith.addf %28, %26 : f32 + memref.store %29, %arg2[%0] : memref<32xf32> + } + gpu.barrier + gpu.return + } + gpu.func @Unknown147_kernel_0(%arg0: memref<32xf32>, %arg1: memref) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c16 = arith.constant 16 : index + %cst = arith.constant 0.000000e+00 : f32 + %c32 = arith.constant 32 : index + %0 = gpu.block_id x + %alloca = memref.alloca() : memref<32xf32, #gpu.address_space> + %1 = gpu.thread_id x + %2 = arith.muli %0, %c32 : index + %3 = arith.addi %2, %1 : index + %4 = memref.load %arg0[%3] : memref<32xf32> + %5 = arith.addf %4, %cst : f32 + memref.store %5, %alloca[%1] : memref<32xf32, #gpu.address_space> + gpu.barrier + %alloca_0 = memref.alloca() : memref<16xf32, #gpu.address_space> + %6 = arith.cmpi ult, %1, %c16 : index + scf.if %6 { + %11 = arith.muli %1, %c2 : index + %12 = memref.load %alloca[%11] : memref<32xf32, #gpu.address_space> + %13 = arith.addf %12, %cst : f32 + %14 = arith.addi %11, %c1 : index + %15 = memref.load %alloca[%14] : memref<32xf32, #gpu.address_space> + %16 = arith.addf %15, %13 : f32 + memref.store %16, %alloca_0[%1] : memref<16xf32, #gpu.address_space> + } + gpu.barrier + %alloca_1 = memref.alloca() : memref<8xf32, #gpu.address_space> + %7 = arith.cmpi ult, %1, %c8 : index + scf.if %7 { + %11 = arith.muli %1, %c2 : index + %12 = memref.load %alloca_0[%11] : memref<16xf32, #gpu.address_space> + %13 = arith.addf %12, %cst : f32 + %14 = arith.addi %11, %c1 : index + %15 = memref.load %alloca_0[%14] : memref<16xf32, #gpu.address_space> + %16 = arith.addf %15, %13 : f32 + memref.store %16, %alloca_1[%1] : memref<8xf32, 
#gpu.address_space> + } + gpu.barrier + %alloca_2 = memref.alloca() : memref<4xf32, #gpu.address_space> + %8 = arith.cmpi ult, %1, %c4 : index + scf.if %8 { + %11 = arith.muli %1, %c2 : index + %12 = memref.load %alloca_1[%11] : memref<8xf32, #gpu.address_space> + %13 = arith.addf %12, %cst : f32 + %14 = arith.addi %11, %c1 : index + %15 = memref.load %alloca_1[%14] : memref<8xf32, #gpu.address_space> + %16 = arith.addf %15, %13 : f32 + memref.store %16, %alloca_2[%1] : memref<4xf32, #gpu.address_space> + } + gpu.barrier + %alloca_3 = memref.alloca() : memref<2xf32, #gpu.address_space> + %9 = arith.cmpi ult, %1, %c2 : index + scf.if %9 { + %11 = arith.muli %1, %c2 : index + %12 = memref.load %alloca_2[%11] : memref<4xf32, #gpu.address_space> + %13 = arith.addf %12, %cst : f32 + %14 = arith.addi %11, %c1 : index + %15 = memref.load %alloca_2[%14] : memref<4xf32, #gpu.address_space> + %16 = arith.addf %15, %13 : f32 + memref.store %16, %alloca_3[%1] : memref<2xf32, #gpu.address_space> + } + gpu.barrier + %10 = arith.cmpi ult, %1, %c1 : index + scf.if %10 { + %11 = arith.muli %1, %c2 : index + %12 = memref.load %alloca_3[%11] : memref<2xf32, #gpu.address_space> + %13 = arith.addf %12, %cst : f32 + %14 = arith.addi %11, %c1 : index + %15 = memref.load %alloca_3[%14] : memref<2xf32, #gpu.address_space> + %16 = arith.addf %15, %13 : f32 + memref.store %16, %arg1[] : memref + } + gpu.barrier + gpu.return + } + gpu.func @Unknown171_kernel(%arg0: memref<4x1000xf16>, %arg1: memref<1000xf32>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array} { + %cst = arith.constant 0.000000e+00 : f32 + %cst_0 = arith.constant 0.000000e+00 : f16 + %c2 = arith.constant 2 : index + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c32 = arith.constant 32 : index + %c1000 = arith.constant 1000 : index + %c-32 = arith.constant -32 : index + %0 = gpu.block_id x + %1 = arith.muli %0, %c-32 : index + %2 = arith.addi %1, %c1000 : index + %3 = arith.cmpi slt, %2, %c32 : index + %4 = arith.select %3, %2, %c32 : index + %5 = arith.muli %0, %c32 : index + %alloca = memref.alloca() : memref<32xf32, #gpu.address_space> + %alloca_1 = memref.alloca() : memref<2x32xf32, #gpu.address_space> + %6 = gpu.thread_id x + %7 = gpu.thread_id y + %8 = arith.cmpi slt, %4, %6 : index + %9 = arith.select %8, %4, %6 : index + %10 = arith.addi %6, %c1 : index + %11 = arith.cmpi slt, %4, %10 : index + %12 = arith.select %11, %4, %10 : index + %13 = arith.subi %12, %9 : index + %14 = arith.cmpi ugt, %13, %c0 : index + %15 = scf.if %14 -> (f16) { + %22 = arith.muli %7, %c2 : index + %23 = arith.addi %5, %9 : index + %24 = memref.load %arg0[%22, %23] : memref<4x1000xf16> + scf.yield %24 : f16 + } else { + scf.yield %cst_0 : f16 + } + %16 = arith.extf %15 : f16 to f32 + %17 = arith.addf %16, %cst : f32 + %18 = scf.if %14 -> (f16) { + %22 = arith.muli %7, %c2 : index + %23 = arith.addi %22, %c1 : index + %24 = arith.addi %5, %9 : index + %25 = memref.load %arg0[%23, %24] : memref<4x1000xf16> + scf.yield %25 : f16 + } else { + scf.yield %cst_0 : f16 + } + %19 = arith.extf %18 : f16 to f32 + %20 = arith.addf %17, %19 : f32 + memref.store %20, %alloca_1[%7, %6] : memref<2x32xf32, #gpu.address_space> + gpu.barrier + %21 = arith.cmpi ult, %7, %c1 : index + scf.if %21 { + %22 = memref.load %alloca_1[%c0, %6] : memref<2x32xf32, #gpu.address_space> + %23 = arith.addf %22, %cst : f32 + %24 = memref.load %alloca_1[%c1, %6] : memref<2x32xf32, #gpu.address_space> + %25 = arith.addf %24, %23 : f32 + memref.store %25, 
%alloca[%6] : memref<32xf32, #gpu.address_space> + } + gpu.barrier + %subview = memref.subview %alloca[0] [%4] [1] : memref<32xf32, #gpu.address_space> to memref, #gpu.address_space> + %subview_2 = memref.subview %arg1[%5] [%4] [1] : memref<1000xf32> to memref> + memref.copy %subview, %subview_2 : memref, #gpu.address_space> to memref> gpu.return } } func.func @main(%arg0: memref<4x3x224x224xf32, "cuda"> {byre.argname = "Input0", byre.argtype = 1 : i32}, %arg1: memref<4x1000xf32, "cuda"> {byre.argname = "Input1", byre.argtype = 1 : i32}, %arg2: memref<64x3x7x7xf32, "cuda"> {byre.argname = "Input2", byre.argtype = 1 : i32}, %arg3: memref<64xf32, "cuda"> {byre.argname = "Input3", byre.argtype = 1 : i32}, %arg4: memref<64xf32, "cuda"> {byre.argname = "Input4", byre.argtype = 1 : i32}, %arg5: memref<64xf32, "cuda"> {byre.argname = "Input5", byre.argtype = 1 : i32}, %arg6: memref<64xf32, "cuda"> {byre.argname = "Input6", byre.argtype = 1 : i32}, %arg7: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Input7", byre.argtype = 1 : i32}, %arg8: memref<64xf32, "cuda"> {byre.argname = "Input8", byre.argtype = 1 : i32}, %arg9: memref<64xf32, "cuda"> {byre.argname = "Input9", byre.argtype = 1 : i32}, %arg10: memref<64xf32, "cuda"> {byre.argname = "Input10", byre.argtype = 1 : i32}, %arg11: memref<64xf32, "cuda"> {byre.argname = "Input11", byre.argtype = 1 : i32}, %arg12: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Input12", byre.argtype = 1 : i32}, %arg13: memref<64xf32, "cuda"> {byre.argname = "Input13", byre.argtype = 1 : i32}, %arg14: memref<64xf32, "cuda"> {byre.argname = "Input14", byre.argtype = 1 : i32}, %arg15: memref<64xf32, "cuda"> {byre.argname = "Input15", byre.argtype = 1 : i32}, %arg16: memref<64xf32, "cuda"> {byre.argname = "Input16", byre.argtype = 1 : i32}, %arg17: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Input17", byre.argtype = 1 : i32}, %arg18: memref<64xf32, "cuda"> {byre.argname = "Input18", byre.argtype = 1 : i32}, %arg19: memref<64xf32, "cuda"> {byre.argname = "Input19", byre.argtype = 1 : i32}, %arg20: memref<64xf32, "cuda"> {byre.argname = "Input20", byre.argtype = 1 : i32}, %arg21: memref<64xf32, "cuda"> {byre.argname = "Input21", byre.argtype = 1 : i32}, %arg22: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Input22", byre.argtype = 1 : i32}, %arg23: memref<64xf32, "cuda"> {byre.argname = "Input23", byre.argtype = 1 : i32}, %arg24: memref<64xf32, "cuda"> {byre.argname = "Input24", byre.argtype = 1 : i32}, %arg25: memref<64xf32, "cuda"> {byre.argname = "Input25", byre.argtype = 1 : i32}, %arg26: memref<64xf32, "cuda"> {byre.argname = "Input26", byre.argtype = 1 : i32}, %arg27: memref<128x64x3x3xf32, "cuda"> {byre.argname = "Input27", byre.argtype = 1 : i32}, %arg28: memref<128xf32, "cuda"> {byre.argname = "Input28", byre.argtype = 1 : i32}, %arg29: memref<128xf32, "cuda"> {byre.argname = "Input29", byre.argtype = 1 : i32}, %arg30: memref<128xf32, "cuda"> {byre.argname = "Input30", byre.argtype = 1 : i32}, %arg31: memref<128xf32, "cuda"> {byre.argname = "Input31", byre.argtype = 1 : i32}, %arg32: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Input32", byre.argtype = 1 : i32}, %arg33: memref<128xf32, "cuda"> {byre.argname = "Input33", byre.argtype = 1 : i32}, %arg34: memref<128xf32, "cuda"> {byre.argname = "Input34", byre.argtype = 1 : i32}, %arg35: memref<128xf32, "cuda"> {byre.argname = "Input35", byre.argtype = 1 : i32}, %arg36: memref<128xf32, "cuda"> {byre.argname = "Input36", byre.argtype = 1 : i32}, %arg37: memref<128x64x1x1xf32, "cuda"> {byre.argname = 
"Input37", byre.argtype = 1 : i32}, %arg38: memref<128xf32, "cuda"> {byre.argname = "Input38", byre.argtype = 1 : i32}, %arg39: memref<128xf32, "cuda"> {byre.argname = "Input39", byre.argtype = 1 : i32}, %arg40: memref<128xf32, "cuda"> {byre.argname = "Input40", byre.argtype = 1 : i32}, %arg41: memref<128xf32, "cuda"> {byre.argname = "Input41", byre.argtype = 1 : i32}, %arg42: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Input42", byre.argtype = 1 : i32}, %arg43: memref<128xf32, "cuda"> {byre.argname = "Input43", byre.argtype = 1 : i32}, %arg44: memref<128xf32, "cuda"> {byre.argname = "Input44", byre.argtype = 1 : i32}, %arg45: memref<128xf32, "cuda"> {byre.argname = "Input45", byre.argtype = 1 : i32}, %arg46: memref<128xf32, "cuda"> {byre.argname = "Input46", byre.argtype = 1 : i32}, %arg47: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Input47", byre.argtype = 1 : i32}, %arg48: memref<128xf32, "cuda"> {byre.argname = "Input48", byre.argtype = 1 : i32}, %arg49: memref<128xf32, "cuda"> {byre.argname = "Input49", byre.argtype = 1 : i32}, %arg50: memref<128xf32, "cuda"> {byre.argname = "Input50", byre.argtype = 1 : i32}, %arg51: memref<128xf32, "cuda"> {byre.argname = "Input51", byre.argtype = 1 : i32}, %arg52: memref<256x128x3x3xf32, "cuda"> {byre.argname = "Input52", byre.argtype = 1 : i32}, %arg53: memref<256xf32, "cuda"> {byre.argname = "Input53", byre.argtype = 1 : i32}, %arg54: memref<256xf32, "cuda"> {byre.argname = "Input54", byre.argtype = 1 : i32}, %arg55: memref<256xf32, "cuda"> {byre.argname = "Input55", byre.argtype = 1 : i32}, %arg56: memref<256xf32, "cuda"> {byre.argname = "Input56", byre.argtype = 1 : i32}, %arg57: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Input57", byre.argtype = 1 : i32}, %arg58: memref<256xf32, "cuda"> {byre.argname = "Input58", byre.argtype = 1 : i32}, %arg59: memref<256xf32, "cuda"> {byre.argname = "Input59", byre.argtype = 1 : i32}, %arg60: memref<256xf32, "cuda"> {byre.argname = "Input60", byre.argtype = 1 : i32}, %arg61: memref<256xf32, "cuda"> {byre.argname = "Input61", byre.argtype = 1 : i32}, %arg62: memref<256x128x1x1xf32, "cuda"> {byre.argname = "Input62", byre.argtype = 1 : i32}, %arg63: memref<256xf32, "cuda"> {byre.argname = "Input63", byre.argtype = 1 : i32}, %arg64: memref<256xf32, "cuda"> {byre.argname = "Input64", byre.argtype = 1 : i32}, %arg65: memref<256xf32, "cuda"> {byre.argname = "Input65", byre.argtype = 1 : i32}, %arg66: memref<256xf32, "cuda"> {byre.argname = "Input66", byre.argtype = 1 : i32}, %arg67: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Input67", byre.argtype = 1 : i32}, %arg68: memref<256xf32, "cuda"> {byre.argname = "Input68", byre.argtype = 1 : i32}, %arg69: memref<256xf32, "cuda"> {byre.argname = "Input69", byre.argtype = 1 : i32}, %arg70: memref<256xf32, "cuda"> {byre.argname = "Input70", byre.argtype = 1 : i32}, %arg71: memref<256xf32, "cuda"> {byre.argname = "Input71", byre.argtype = 1 : i32}, %arg72: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Input72", byre.argtype = 1 : i32}, %arg73: memref<256xf32, "cuda"> {byre.argname = "Input73", byre.argtype = 1 : i32}, %arg74: memref<256xf32, "cuda"> {byre.argname = "Input74", byre.argtype = 1 : i32}, %arg75: memref<256xf32, "cuda"> {byre.argname = "Input75", byre.argtype = 1 : i32}, %arg76: memref<256xf32, "cuda"> {byre.argname = "Input76", byre.argtype = 1 : i32}, %arg77: memref<512x256x3x3xf32, "cuda"> {byre.argname = "Input77", byre.argtype = 1 : i32}, %arg78: memref<512xf32, "cuda"> {byre.argname = "Input78", byre.argtype = 1 : i32}, %arg79: 
memref<512xf32, "cuda"> {byre.argname = "Input79", byre.argtype = 1 : i32}, %arg80: memref<512xf32, "cuda"> {byre.argname = "Input80", byre.argtype = 1 : i32}, %arg81: memref<512xf32, "cuda"> {byre.argname = "Input81", byre.argtype = 1 : i32}, %arg82: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Input82", byre.argtype = 1 : i32}, %arg83: memref<512xf32, "cuda"> {byre.argname = "Input83", byre.argtype = 1 : i32}, %arg84: memref<512xf32, "cuda"> {byre.argname = "Input84", byre.argtype = 1 : i32}, %arg85: memref<512xf32, "cuda"> {byre.argname = "Input85", byre.argtype = 1 : i32}, %arg86: memref<512xf32, "cuda"> {byre.argname = "Input86", byre.argtype = 1 : i32}, %arg87: memref<512x256x1x1xf32, "cuda"> {byre.argname = "Input87", byre.argtype = 1 : i32}, %arg88: memref<512xf32, "cuda"> {byre.argname = "Input88", byre.argtype = 1 : i32}, %arg89: memref<512xf32, "cuda"> {byre.argname = "Input89", byre.argtype = 1 : i32}, %arg90: memref<512xf32, "cuda"> {byre.argname = "Input90", byre.argtype = 1 : i32}, %arg91: memref<512xf32, "cuda"> {byre.argname = "Input91", byre.argtype = 1 : i32}, %arg92: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Input92", byre.argtype = 1 : i32}, %arg93: memref<512xf32, "cuda"> {byre.argname = "Input93", byre.argtype = 1 : i32}, %arg94: memref<512xf32, "cuda"> {byre.argname = "Input94", byre.argtype = 1 : i32}, %arg95: memref<512xf32, "cuda"> {byre.argname = "Input95", byre.argtype = 1 : i32}, %arg96: memref<512xf32, "cuda"> {byre.argname = "Input96", byre.argtype = 1 : i32}, %arg97: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Input97", byre.argtype = 1 : i32}, %arg98: memref<512xf32, "cuda"> {byre.argname = "Input98", byre.argtype = 1 : i32}, %arg99: memref<512xf32, "cuda"> {byre.argname = "Input99", byre.argtype = 1 : i32}, %arg100: memref<512xf32, "cuda"> {byre.argname = "Input100", byre.argtype = 1 : i32}, %arg101: memref<512xf32, "cuda"> {byre.argname = "Input101", byre.argtype = 1 : i32}, %arg102: memref<1000x512xf32, "cuda"> {byre.argname = "Input102", byre.argtype = 1 : i32}, %arg103: memref<1000xf32, "cuda"> {byre.argname = "Input103", byre.argtype = 1 : i32}, %arg104: memref {byre.argname = "Output0", byre.argtype = 2 : i32}, %arg105: memref<64x3x7x7xf32, "cuda"> {byre.argname = "Output1", byre.argtype = 2 : i32}, %arg106: memref<64xf32, "cuda"> {byre.argname = "Output2", byre.argtype = 2 : i32}, %arg107: memref<64xf32, "cuda"> {byre.argname = "Output3", byre.argtype = 2 : i32}, %arg108: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Output4", byre.argtype = 2 : i32}, %arg109: memref<64xf32, "cuda"> {byre.argname = "Output5", byre.argtype = 2 : i32}, %arg110: memref<64xf32, "cuda"> {byre.argname = "Output6", byre.argtype = 2 : i32}, %arg111: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Output7", byre.argtype = 2 : i32}, %arg112: memref<64xf32, "cuda"> {byre.argname = "Output8", byre.argtype = 2 : i32}, %arg113: memref<64xf32, "cuda"> {byre.argname = "Output9", byre.argtype = 2 : i32}, %arg114: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Output10", byre.argtype = 2 : i32}, %arg115: memref<64xf32, "cuda"> {byre.argname = "Output11", byre.argtype = 2 : i32}, %arg116: memref<64xf32, "cuda"> {byre.argname = "Output12", byre.argtype = 2 : i32}, %arg117: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Output13", byre.argtype = 2 : i32}, %arg118: memref<64xf32, "cuda"> {byre.argname = "Output14", byre.argtype = 2 : i32}, %arg119: memref<64xf32, "cuda"> {byre.argname = "Output15", byre.argtype = 2 : i32}, %arg120: memref<128x64x3x3xf32, "cuda"> 
{byre.argname = "Output16", byre.argtype = 2 : i32}, %arg121: memref<128xf32, "cuda"> {byre.argname = "Output17", byre.argtype = 2 : i32}, %arg122: memref<128xf32, "cuda"> {byre.argname = "Output18", byre.argtype = 2 : i32}, %arg123: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Output19", byre.argtype = 2 : i32}, %arg124: memref<128xf32, "cuda"> {byre.argname = "Output20", byre.argtype = 2 : i32}, %arg125: memref<128xf32, "cuda"> {byre.argname = "Output21", byre.argtype = 2 : i32}, %arg126: memref<128x64x1x1xf32, "cuda"> {byre.argname = "Output22", byre.argtype = 2 : i32}, %arg127: memref<128xf32, "cuda"> {byre.argname = "Output23", byre.argtype = 2 : i32}, %arg128: memref<128xf32, "cuda"> {byre.argname = "Output24", byre.argtype = 2 : i32}, %arg129: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Output25", byre.argtype = 2 : i32}, %arg130: memref<128xf32, "cuda"> {byre.argname = "Output26", byre.argtype = 2 : i32}, %arg131: memref<128xf32, "cuda"> {byre.argname = "Output27", byre.argtype = 2 : i32}, %arg132: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Output28", byre.argtype = 2 : i32}, %arg133: memref<128xf32, "cuda"> {byre.argname = "Output29", byre.argtype = 2 : i32}, %arg134: memref<128xf32, "cuda"> {byre.argname = "Output30", byre.argtype = 2 : i32}, %arg135: memref<256x128x3x3xf32, "cuda"> {byre.argname = "Output31", byre.argtype = 2 : i32}, %arg136: memref<256xf32, "cuda"> {byre.argname = "Output32", byre.argtype = 2 : i32}, %arg137: memref<256xf32, "cuda"> {byre.argname = "Output33", byre.argtype = 2 : i32}, %arg138: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Output34", byre.argtype = 2 : i32}, %arg139: memref<256xf32, "cuda"> {byre.argname = "Output35", byre.argtype = 2 : i32}, %arg140: memref<256xf32, "cuda"> {byre.argname = "Output36", byre.argtype = 2 : i32}, %arg141: memref<256x128x1x1xf32, "cuda"> {byre.argname = "Output37", byre.argtype = 2 : i32}, %arg142: memref<256xf32, "cuda"> {byre.argname = "Output38", byre.argtype = 2 : i32}, %arg143: memref<256xf32, "cuda"> {byre.argname = "Output39", byre.argtype = 2 : i32}, %arg144: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Output40", byre.argtype = 2 : i32}, %arg145: memref<256xf32, "cuda"> {byre.argname = "Output41", byre.argtype = 2 : i32}, %arg146: memref<256xf32, "cuda"> {byre.argname = "Output42", byre.argtype = 2 : i32}, %arg147: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Output43", byre.argtype = 2 : i32}, %arg148: memref<256xf32, "cuda"> {byre.argname = "Output44", byre.argtype = 2 : i32}, %arg149: memref<256xf32, "cuda"> {byre.argname = "Output45", byre.argtype = 2 : i32}, %arg150: memref<512x256x3x3xf32, "cuda"> {byre.argname = "Output46", byre.argtype = 2 : i32}, %arg151: memref<512xf32, "cuda"> {byre.argname = "Output47", byre.argtype = 2 : i32}, %arg152: memref<512xf32, "cuda"> {byre.argname = "Output48", byre.argtype = 2 : i32}, %arg153: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Output49", byre.argtype = 2 : i32}, %arg154: memref<512xf32, "cuda"> {byre.argname = "Output50", byre.argtype = 2 : i32}, %arg155: memref<512xf32, "cuda"> {byre.argname = "Output51", byre.argtype = 2 : i32}, %arg156: memref<512x256x1x1xf32, "cuda"> {byre.argname = "Output52", byre.argtype = 2 : i32}, %arg157: memref<512xf32, "cuda"> {byre.argname = "Output53", byre.argtype = 2 : i32}, %arg158: memref<512xf32, "cuda"> {byre.argname = "Output54", byre.argtype = 2 : i32}, %arg159: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Output55", byre.argtype = 2 : i32}, %arg160: memref<512xf32, "cuda"> 
{byre.argname = "Output56", byre.argtype = 2 : i32}, %arg161: memref<512xf32, "cuda"> {byre.argname = "Output57", byre.argtype = 2 : i32}, %arg162: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Output58", byre.argtype = 2 : i32}, %arg163: memref<512xf32, "cuda"> {byre.argname = "Output59", byre.argtype = 2 : i32}, %arg164: memref<512xf32, "cuda"> {byre.argname = "Output60", byre.argtype = 2 : i32}, %arg165: memref<1000x512xf32, "cuda"> {byre.argname = "Output61", byre.argtype = 2 : i32}, %arg166: memref<1000xf32, "cuda"> {byre.argname = "Output62", byre.argtype = 2 : i32}) attributes {byre.entry_point} { - %alloc = memref.alloc() : memref<76022848xi8, "cuda"> - %0 = "byre.alias"(%alloc) {offset = 8012864 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x3x224x224xf16, "cuda"> - byre.compute @PTXOp(%arg0, %0) {BlockSize.x = 128 : i32, GridSize.x = 4704 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown0", memory_effects = [1 : i32, 2 : i32]} : memref<4x3x224x224xf32, "cuda">, memref<4x3x224x224xf16, "cuda"> - %1 = "byre.alias"(%alloc) {offset = 62424128 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x3x7x7xf16, "cuda"> - byre.compute @PTXOp(%arg2, %1) {BlockSize.x = 128 : i32, GridSize.x = 74 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown1", memory_effects = [1 : i32, 2 : i32]} : memref<64x3x7x7xf32, "cuda">, memref<64x3x7x7xf16, "cuda"> - %2 = "byre.alias"(%alloc) {offset = 50996288 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x112x112xf16, "cuda"> + %alloc = memref.alloc() : memref<76533504xi8, "cuda"> + %0 = "byre.alias"(%alloc) <{offset = 75329280 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x3x224x224xf16, "cuda"> + byre.compute @PTXOp(%arg0, %0) {BlockSize.x = 256 : i32, GridSize.x = 588 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown0", memory_effects = [1 : i32, 2 : i32]} : memref<4x3x224x224xf32, "cuda">, memref<4x3x224x224xf16, "cuda"> + %1 = "byre.alias"(%alloc) <{offset = 62959360 : i64}> : (memref<76533504xi8, "cuda">) -> memref<64x3x7x7xf16, "cuda"> + byre.compute @PTXOp(%arg2, %1) {BlockSize.x = 256 : i32, GridSize.x = 10 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown1", memory_effects = [1 : i32, 2 : i32]} : memref<64x3x7x7xf32, "cuda">, memref<64x3x7x7xf16, "cuda"> + %2 = "byre.alias"(%alloc) <{offset = 49311488 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x112x112xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%0, %1, %2) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x3x224x224xf16, "cuda">, memref<64x3x7x7xf16, "cuda">, memref<4x64x112x112xf16, "cuda"> - %3 = "byre.alias"(%alloc) {offset = 44573760 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x112x112xf16, "cuda"> + %3 = "byre.alias"(%alloc) <{offset = 42888960 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x112x112xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%2, %arg3, %arg4, %3) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x112x112xf16, "cuda"> - %4 = "byre.alias"(%alloc) {offset = 5080128 : i64} : 
(memref<76022848xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg7, %4) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown3", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> - %5 = "byre.alias"(%alloc) {offset = 5006400 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg12, %5) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown4", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> - %6 = "byre.alias"(%alloc) {offset = 1552384 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg17, %6) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown5", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> - %7 = "byre.alias"(%alloc) {offset = 5153856 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg22, %7) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown6", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> - %8 = "byre.alias"(%alloc) {offset = 4247104 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x64x1x1xf16, "cuda"> - byre.compute @PTXOp(%arg37, %8) {BlockSize.x = 128 : i32, GridSize.x = 64 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown7", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x1x1xf32, "cuda">, memref<128x64x1x1xf16, "cuda"> - %9 = "byre.alias"(%alloc) {offset = 602112 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg27, %9) {BlockSize.x = 128 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown8", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x3x3xf32, "cuda">, memref<128x64x3x3xf16, "cuda"> - %10 = "byre.alias"(%alloc) {offset = 2383872 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg32, %10) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown9", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> - %11 = "byre.alias"(%alloc) {offset = 2088960 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg42, %11) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown10", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> - %12 = "byre.alias"(%alloc) {offset = 2678784 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg47, %12) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown11", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> - %13 = "byre.alias"(%alloc) {offset = 4940864 : i64} : (memref<76022848xi8, "cuda">) -> memref<256x128x1x1xf16, "cuda"> - byre.compute @PTXOp(%arg62, %13) {BlockSize.x = 128 : i32, GridSize.x = 256 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown12", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x1x1xf32, "cuda">, 
memref<256x128x1x1xf16, "cuda"> - %14 = "byre.alias"(%alloc) {offset = 60228672 : i64} : (memref<76022848xi8, "cuda">) -> memref<256x128x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg52, %14) {BlockSize.x = 128 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown13", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x3x3xf32, "cuda">, memref<256x128x3x3xf16, "cuda"> - %15 = "byre.alias"(%alloc) {offset = 73663552 : i64} : (memref<76022848xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg57, %15) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown14", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> - %16 = "byre.alias"(%alloc) {offset = 18850880 : i64} : (memref<76022848xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg67, %16) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown15", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> - %17 = "byre.alias"(%alloc) {offset = 6833216 : i64} : (memref<76022848xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg72, %17) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown16", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> - %18 = "byre.alias"(%alloc) {offset = 59425856 : i64} : (memref<76022848xi8, "cuda">) -> memref<512x256x1x1xf16, "cuda"> - byre.compute @PTXOp(%arg87, %18) {BlockSize.x = 128 : i32, GridSize.x = 1024 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown17", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x1x1xf32, "cuda">, memref<512x256x1x1xf16, "cuda"> - %19 = "byre.alias"(%alloc) {offset = 21636160 : i64} : (memref<76022848xi8, "cuda">) -> memref<512x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg77, %19) {BlockSize.x = 128 : i32, GridSize.x = 9216 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown18", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x3x3xf32, "cuda">, memref<512x256x3x3xf16, "cuda"> - %20 = "byre.alias"(%alloc) {offset = 38151232 : i64} : (memref<76022848xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg82, %20) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown19", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> - %21 = "byre.alias"(%alloc) {offset = 33432640 : i64} : (memref<76022848xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg92, %21) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown20", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> - %22 = "byre.alias"(%alloc) {offset = 28714048 : i64} : (memref<76022848xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg97, %22) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown21", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> - %23 = "byre.alias"(%alloc) {offset = 749568 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x1000xf16, "cuda"> - byre.compute @PTXOp(%arg1, %23) {BlockSize.x = 128 : i32, 
GridSize.x = 32 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown22", memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf32, "cuda">, memref<4x1000xf16, "cuda"> - %24 = "byre.alias"(%alloc) {offset = 23995456 : i64} : (memref<76022848xi8, "cuda">) -> memref<1000x512xf16, "cuda"> - byre.compute @PTXOp(%arg102, %24) {BlockSize.x = 128 : i32, GridSize.x = 4000 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown23", memory_effects = [1 : i32, 2 : i32]} : memref<1000x512xf32, "cuda">, memref<1000x512xf16, "cuda"> - %25 = "byre.alias"(%alloc) {offset = 757568 : i64} : (memref<76022848xi8, "cuda">) -> memref<4xf16, "cuda"> - byre.compute @ReduceSumOp_f16_f16(%23, %25) {device = "cuda", dimensions = dense<1> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf16, "cuda">, memref<4xf16, "cuda"> - %26 = "byre.alias"(%alloc) {offset = 62424128 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x112x112xf16, "cuda"> - %27 = "byre.alias"(%alloc) {offset = 59827264 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x112x112xi1, "cuda"> - byre.compute @PTXOp(%3, %26, %27) {BlockSize.x = 128 : i32, GridSize.x = 25088 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown24", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xi1, "cuda"> - %28 = "byre.alias"(%alloc) {offset = 5227584 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @PoolMaxOp_f16_f16(%26, %28) {base_dilations = dense<1> : tensor<4xi64>, device = "cuda", memory_effects = [1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = dense<1> : tensor<4xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %29 = "byre.alias"(%alloc) {offset = 20030528 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%28, %4, %29) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %30 = "byre.alias"(%alloc) {offset = 44573760 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%29, %arg8, %arg9, %30) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> - %31 = "byre.alias"(%alloc) {offset = 17245248 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - %32 = "byre.alias"(%alloc) {offset = 301056 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xi1, "cuda"> - byre.compute @PTXOp(%30, %31, %32) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown26", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda"> - %33 = 
"byre.alias"(%alloc) {offset = 15639616 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%31, %5, %33) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %34 = "byre.alias"(%alloc) {offset = 42869824 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%33, %arg13, %arg14, %34) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> - %35 = "byre.alias"(%alloc) {offset = 501760 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xi1, "cuda"> - byre.compute @PTXOp(%34, %28, %30, %35) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown28", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda"> - %36 = "byre.alias"(%alloc) {offset = 14033984 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%30, %6, %36) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %37 = "byre.alias"(%alloc) {offset = 46179392 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%36, %arg18, %arg19, %37) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> - %38 = "byre.alias"(%alloc) {offset = 12428352 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - %39 = "byre.alias"(%alloc) {offset = 200704 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xi1, "cuda"> - byre.compute @PTXOp(%37, %38, %39) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown30", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda"> - %40 = "byre.alias"(%alloc) {offset = 9217088 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + %4 = "byre.alias"(%alloc) <{offset = 5545728 : i64}> : (memref<76533504xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg7, %4) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown3", memory_effects = [1 : i32, 2 : i32]} : 
memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> + %5 = "byre.alias"(%alloc) <{offset = 5361664 : i64}> : (memref<76533504xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg12, %5) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown3", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> + %6 = "byre.alias"(%alloc) <{offset = 6283008 : i64}> : (memref<76533504xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg17, %6) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown3", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> + %7 = "byre.alias"(%alloc) <{offset = 6209280 : i64}> : (memref<76533504xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg22, %7) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown3", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> + %8 = "byre.alias"(%alloc) <{offset = 5463808 : i64}> : (memref<76533504xi8, "cuda">) -> memref<128x64x1x1xf16, "cuda"> + byre.compute @PTXOp(%arg37, %8) {BlockSize.x = 256 : i32, GridSize.x = 8 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown7", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x1x1xf32, "cuda">, memref<128x64x1x1xf16, "cuda"> + %9 = "byre.alias"(%alloc) <{offset = 6557440 : i64}> : (memref<76533504xi8, "cuda">) -> memref<128x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg27, %9) {BlockSize.x = 256 : i32, GridSize.x = 72 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown8", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x3x3xf32, "cuda">, memref<128x64x3x3xf16, "cuda"> + %10 = "byre.alias"(%alloc) <{offset = 2256896 : i64}> : (memref<76533504xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg32, %10) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown9", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> + %11 = "byre.alias"(%alloc) <{offset = 1761280 : i64}> : (memref<76533504xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg42, %11) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown9", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> + %12 = "byre.alias"(%alloc) <{offset = 0 : i64}> : (memref<76533504xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg47, %12) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown9", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> + %13 = "byre.alias"(%alloc) <{offset = 5480192 : i64}> : (memref<76533504xi8, "cuda">) -> memref<256x128x1x1xf16, "cuda"> + byre.compute @PTXOp(%arg62, %13) {BlockSize.x = 256 : i32, GridSize.x = 32 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown12", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x1x1xf32, "cuda">, memref<256x128x1x1xf16, "cuda"> + %14 = "byre.alias"(%alloc) <{offset = 5619456 : i64}> : (memref<76533504xi8, "cuda">) -> memref<256x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg52, %14) {BlockSize.x = 256 : i32, GridSize.x = 288 : i32, 
arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown13", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x3x3xf32, "cuda">, memref<256x128x3x3xf16, "cuda"> + %15 = "byre.alias"(%alloc) <{offset = 74149632 : i64}> : (memref<76533504xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg57, %15) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown14", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> + %16 = "byre.alias"(%alloc) <{offset = 21556992 : i64}> : (memref<76533504xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg67, %16) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown14", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> + %17 = "byre.alias"(%alloc) <{offset = 8711936 : i64}> : (memref<76533504xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg72, %17) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown14", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> + %18 = "byre.alias"(%alloc) <{offset = 4558848 : i64}> : (memref<76533504xi8, "cuda">) -> memref<512x256x1x1xf16, "cuda"> + byre.compute @PTXOp(%arg87, %18) {BlockSize.x = 256 : i32, GridSize.x = 128 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown17", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x1x1xf32, "cuda">, memref<512x256x1x1xf16, "cuda"> + %19 = "byre.alias"(%alloc) <{offset = 23162624 : i64}> : (memref<76533504xi8, "cuda">) -> memref<512x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg77, %19) {BlockSize.x = 256 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown18", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x3x3xf32, "cuda">, memref<512x256x3x3xf16, "cuda"> + %20 = "byre.alias"(%alloc) <{offset = 28733184 : i64}> : (memref<76533504xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg82, %20) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown19", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> + %21 = "byre.alias"(%alloc) <{offset = 33451776 : i64}> : (memref<76533504xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg92, %21) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown19", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> + %22 = "byre.alias"(%alloc) <{offset = 38170368 : i64}> : (memref<76533504xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg97, %22) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown19", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> + %23 = "byre.alias"(%alloc) <{offset = 5439616 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x1000xf16, "cuda"> + byre.compute @PTXOp(%arg1, %23) {BlockSize.x = 256 : i32, GridSize.x = 4 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown22", memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf32, "cuda">, memref<4x1000xf16, "cuda"> + %24 = "byre.alias"(%alloc) <{offset = 72969984 : i64}> : 
(memref<76533504xi8, "cuda">) -> memref<1000x512xf16, "cuda"> + byre.compute @PTXOp(%arg102, %24) {BlockSize.x = 256 : i32, GridSize.x = 500 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown23", memory_effects = [1 : i32, 2 : i32]} : memref<1000x512xf32, "cuda">, memref<1000x512xf16, "cuda"> + %25 = "byre.alias"(%alloc) <{offset = 73993984 : i64}> : (memref<76533504xi8, "cuda">) -> memref<1000xf16, "cuda"> + byre.compute @PTXOp(%arg103, %25) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32], kernel_name = "Unknown24", memory_effects = [1 : i32, 2 : i32]} : memref<1000xf32, "cuda">, memref<1000xf16, "cuda"> + %26 = "byre.alias"(%alloc) <{offset = 5435392 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4xf16, "cuda"> + byre.compute @PTXOp(%23, %26) {BlockSize.x = 512 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 4 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown25_kernel"} : memref<4x1000xf16, "cuda">, memref<4xf16, "cuda"> + %27 = "byre.alias"(%alloc) <{offset = 62959360 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x112x112xf16, "cuda"> + %28 = "byre.alias"(%alloc) <{offset = 25521920 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x112x112xi1, "cuda"> + byre.compute @PTXOp(%3, %27, %28) {BlockSize.x = 256 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown26", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xi1, "cuda"> + %29 = "byre.alias"(%alloc) <{offset = 15134464 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @PoolMaxOp_f16_f16(%27, %29) {base_dilations = dense<1> : tensor<4xi64>, device = "cuda", memory_effects = [1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = dense<1> : tensor<4xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + %30 = "byre.alias"(%alloc) <{offset = 16740096 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%29, %4, %30) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + %31 = "byre.alias"(%alloc) <{offset = 42888960 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%30, %arg8, %arg9, %31) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> + %32 = "byre.alias"(%alloc) <{offset = 19951360 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + %33 = "byre.alias"(%alloc) <{offset = 69381888 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xi1, "cuda"> + byre.compute @PTXOp(%31, %32, %33) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, 
arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown28", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda"> + %34 = "byre.alias"(%alloc) <{offset = 7106304 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%32, %5, %34) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%34, %arg13, %arg14, %31) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> + %35 = "byre.alias"(%alloc) <{offset = 18345728 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + %36 = "byre.alias"(%alloc) <{offset = 70987520 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xi1, "cuda"> + byre.compute @PTXOp(%31, %29, %35, %36) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown30", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda"> + byre.compute @ConvOp_f16f16_f16(%35, %6, %31) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + %37 = "byre.alias"(%alloc) <{offset = 44494592 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%31, %arg18, %arg19, %37) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> + %38 = "byre.alias"(%alloc) <{offset = 13528832 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + %39 = "byre.alias"(%alloc) <{offset = 57339648 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xi1, "cuda"> + byre.compute @PTXOp(%37, %38, %39) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown28", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda"> + %40 = "byre.alias"(%alloc) <{offset = 11923200 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%38, %7, %40) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : 
tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%40, %arg23, %arg24, %37) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> - %41 = "byre.alias"(%alloc) {offset = 10822720 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - %42 = "byre.alias"(%alloc) {offset = 401408 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xi1, "cuda"> - byre.compute @PTXOp(%37, %30, %41, %42) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown32", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda"> - %43 = "byre.alias"(%alloc) {offset = 61621312 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%41, %8, %43) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<128x64x1x1xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - %44 = "byre.alias"(%alloc) {offset = 42869824 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %41 = "byre.alias"(%alloc) <{offset = 10317568 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%40, %arg23, %arg24, %41) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> + %42 = "byre.alias"(%alloc) <{offset = 70184704 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xi1, "cuda"> + byre.compute @PTXOp(%41, %35, %37, %42) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown30", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda"> + %43 = "byre.alias"(%alloc) <{offset = 58142464 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%37, %8, %43) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<128x64x1x1xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + %44 = "byre.alias"(%alloc) <{offset = 10317568 : 
i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%43, %arg38, %arg39, %44) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda"> - %45 = "byre.alias"(%alloc) {offset = 70452288 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%41, %9, %45) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<128x64x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - %46 = "byre.alias"(%alloc) {offset = 46179392 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %45 = "byre.alias"(%alloc) <{offset = 59748096 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%37, %9, %45) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<128x64x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + %46 = "byre.alias"(%alloc) <{offset = 46100224 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%45, %arg28, %arg29, %46) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda"> - %47 = "byre.alias"(%alloc) {offset = 71255104 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> - %48 = "byre.alias"(%alloc) {offset = 4740160 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xi1, "cuda"> - byre.compute @PTXOp(%46, %47, %48) {BlockSize.x = 128 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown35", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda"> - %49 = "byre.alias"(%alloc) {offset = 72057920 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %47 = "byre.alias"(%alloc) <{offset = 60550912 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %48 = "byre.alias"(%alloc) <{offset = 3756032 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xi1, "cuda"> + byre.compute @PTXOp(%46, %47, %48) {BlockSize.x = 256 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown37", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda"> + %49 = "byre.alias"(%alloc) <{offset = 62156544 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> 
byre.compute @ConvOp_f16f16_f16(%47, %10, %49) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%49, %arg33, %arg34, %46) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda"> - %50 = "byre.alias"(%alloc) {offset = 69649472 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> - %51 = "byre.alias"(%alloc) {offset = 4790336 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xi1, "cuda"> - byre.compute @PTXOp(%46, %44, %50, %51) {BlockSize.x = 128 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown37", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda"> - %52 = "byre.alias"(%alloc) {offset = 60818496 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %50 = "byre.alias"(%alloc) <{offset = 55734016 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %51 = "byre.alias"(%alloc) <{offset = 3354624 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xi1, "cuda"> + byre.compute @PTXOp(%46, %44, %50, %51) {BlockSize.x = 256 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown39", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda"> + %52 = "byre.alias"(%alloc) <{offset = 61353728 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%50, %11, %52) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%52, %arg43, %arg44, %46) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda"> - %53 = "byre.alias"(%alloc) {offset = 57418816 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> - %54 = "byre.alias"(%alloc) {offset = 4840512 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xi1, "cuda"> - byre.compute @PTXOp(%46, %53, %54) {BlockSize.x = 128 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown39", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : 
memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda"> - %55 = "byre.alias"(%alloc) {offset = 68846656 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %53 = "byre.alias"(%alloc) <{offset = 58945280 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %54 = "byre.alias"(%alloc) <{offset = 2953216 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xi1, "cuda"> + byre.compute @PTXOp(%46, %53, %54) {BlockSize.x = 256 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown37", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda"> + %55 = "byre.alias"(%alloc) <{offset = 56536832 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%53, %12, %55) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%55, %arg48, %arg49, %46) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda"> - %56 = "byre.alias"(%alloc) {offset = 72860736 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> - %57 = "byre.alias"(%alloc) {offset = 4890688 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xi1, "cuda"> - byre.compute @PTXOp(%46, %50, %56, %57) {BlockSize.x = 128 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown41", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda"> - %58 = "byre.alias"(%alloc) {offset = 2973696 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%56, %13, %58) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<256x128x1x1xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - %59 = "byre.alias"(%alloc) {offset = 46179392 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%58, %arg63, %arg64, %59) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> - %60 = "byre.alias"(%alloc) {offset = 59024448 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%56, %14, 
%60) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - %61 = "byre.alias"(%alloc) {offset = 46580800 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%60, %arg53, %arg54, %61) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> - %62 = "byre.alias"(%alloc) {offset = 58623040 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - %63 = "byre.alias"(%alloc) {offset = 4263488 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xi1, "cuda"> - byre.compute @PTXOp(%61, %62, %63) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown44", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda"> - %64 = "byre.alias"(%alloc) {offset = 58221632 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%62, %15, %64) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%64, %arg58, %arg59, %61) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> - %65 = "byre.alias"(%alloc) {offset = 4338752 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - %66 = "byre.alias"(%alloc) {offset = 4288576 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xi1, "cuda"> - byre.compute @PTXOp(%61, %59, %65, %66) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown46", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda"> - %67 = "byre.alias"(%alloc) {offset = 3776512 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%65, %16, %67) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, 
"cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%67, %arg68, %arg69, %59) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> - %68 = "byre.alias"(%alloc) {offset = 3375104 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - %69 = "byre.alias"(%alloc) {offset = 4313664 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xi1, "cuda"> - byre.compute @PTXOp(%59, %68, %69) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown48", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda"> - %70 = "byre.alias"(%alloc) {offset = 74843200 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%68, %17, %70) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%70, %arg73, %arg74, %59) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> - %71 = "byre.alias"(%alloc) {offset = 75244608 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - %72 = "byre.alias"(%alloc) {offset = 4177920 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xi1, "cuda"> - byre.compute @PTXOp(%59, %65, %71, %72) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown50", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda"> - %73 = "byre.alias"(%alloc) {offset = 950272 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%71, %18, %73) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - %74 = "byre.alias"(%alloc) {offset = 42869824 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%73, %arg88, %arg89, %74) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> - %75 
= "byre.alias"(%alloc) {offset = 1150976 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%71, %19, %75) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<512x256x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - %76 = "byre.alias"(%alloc) {offset = 46179392 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%75, %arg78, %arg79, %76) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> - %77 = "byre.alias"(%alloc) {offset = 1351680 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - %78 = "byre.alias"(%alloc) {offset = 59688000 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xi1, "cuda"> - byre.compute @PTXOp(%76, %77, %78) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown53", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda"> - %79 = "byre.alias"(%alloc) {offset = 0 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%77, %20, %79) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%79, %arg83, %arg84, %76) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> - %80 = "byre.alias"(%alloc) {offset = 1626112 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - %81 = "byre.alias"(%alloc) {offset = 59700544 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xi1, "cuda"> - byre.compute @PTXOp(%76, %74, %80, %81) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown55", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda"> - byre.compute @ConvOp_f16f16_f16(%80, %21, %76) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, 
memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - %82 = "byre.alias"(%alloc) {offset = 46380096 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%76, %arg93, %arg94, %82) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> - %83 = "byre.alias"(%alloc) {offset = 1826816 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - %84 = "byre.alias"(%alloc) {offset = 59713088 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xi1, "cuda"> - byre.compute @PTXOp(%82, %83, %84) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown57", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda"> - %85 = "byre.alias"(%alloc) {offset = 75646016 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%83, %22, %85) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%85, %arg98, %arg99, %74) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> - %86 = "byre.alias"(%alloc) {offset = 75846720 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xi1, "cuda"> - byre.compute @PTXOp(%74, %80, %82, %86) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown59", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda"> - %87 = "byre.alias"(%alloc) {offset = 42869824 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512xf16, "cuda"> - byre.compute @ReduceSumOp_f16_f16(%82, %87) {device = "cuda", dimensions = dense<[3, 2]> : tensor<2xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512xf16, "cuda"> - %88 = "byre.alias"(%alloc) {offset = 4203008 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512xf16, "cuda"> - byre.compute @PTXOp(%87, %88) {BlockSize.x = 128 : i32, GridSize.x = 16 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown60", memory_effects = [1 : i32, 2 : i32]} : memref<4x512xf16, "cuda">, memref<4x512xf16, "cuda"> - %89 = "byre.alias"(%alloc) {offset = 46380096 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x1000xf16, "cuda"> - byre.compute @MatmulOp_f16f16_f16(%88, %24, %89) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<4x512xf16, "cuda">, memref<1000x512xf16, "cuda">, memref<4x1000xf16, "cuda"> - %90 = "byre.alias"(%alloc) {offset = 25019456 : i64} 
: (memref<76022848xi8, "cuda">) -> memref<4x1000xf16, "cuda"> - byre.compute @PTXOp(%arg103, %89, %90) {BlockSize.x = 128 : i32, GridSize.x = 32 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown61", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1000xf32, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf16, "cuda"> - %91 = "byre.alias"(%alloc) {offset = 25027456 : i64} : (memref<76022848xi8, "cuda">) -> memref<4xf16, "cuda"> - byre.compute @ReduceMaxOp_f16_f16(%90, %91) {device = "cuda", dimensions = dense<1> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf16, "cuda">, memref<4xf16, "cuda"> - %92 = "byre.alias"(%alloc) {offset = 42869824 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x1000xf16, "cuda"> - byre.compute @PTXOp(%91, %90, %92, %89) {BlockSize.x = 128 : i32, GridSize.x = 32 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf16, "cuda"> - %93 = "byre.alias"(%alloc) {offset = 42877824 : i64} : (memref<76022848xi8, "cuda">) -> memref<4xf16, "cuda"> - byre.compute @ReduceSumOp_f16_f16(%89, %93) {device = "cuda", dimensions = dense<1> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf16, "cuda">, memref<4xf16, "cuda"> - %94 = "byre.alias"(%alloc) {offset = 4207104 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x1000xf16, "cuda"> - %95 = "byre.alias"(%alloc) {offset = 4215104 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x1000xf32, "cuda"> - %96 = "byre.alias"(%alloc) {offset = 4231104 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x1000xf32, "cuda"> - byre.compute @PTXOp(%93, %92, %25, %23, %arg1, %94, %95, %96) {BlockSize.x = 128 : i32, GridSize.x = 32 : i32, arg_ranks = [1 : i32, 2 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown63", memory_effects = [1 : i32, 1 : i32, 1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf32, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf32, "cuda">, memref<4x1000xf32, "cuda"> - %97 = "byre.alias"(%alloc) {offset = 46380096 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512xf16, "cuda"> - byre.compute @MatmulOp_f16f16_f16(%94, %24, %97) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<4x1000xf16, "cuda">, memref<1000x512xf16, "cuda">, memref<4x512xf16, "cuda"> - %98 = "byre.alias"(%alloc) {offset = 749568 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @PTXOp(%97, %86, %98) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [2 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown64", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x512xf16, "cuda">, memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%85, %arg98, %98, %82, %arg163, %arg164) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%82, %22, %74) 
{batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%83, %82, %22) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> - byre.compute @PTXOp(%84, %74, %82) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown68", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%76, %arg93, %82, %74, %arg160, %arg161) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%74, %21, %76) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - %99 = "byre.alias"(%alloc) {offset = 23995456 : i64} : (memref<76022848xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%80, %74, %99) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> - byre.compute @PTXOp(%98, %76, %81, %83) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%79, %arg83, %83, %76, %arg154, %arg155) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%76, %20, %85) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, 
window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%77, %76, %21) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> - byre.compute @PTXOp(%78, %85, %76) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown76", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - %100 = "byre.alias"(%alloc) {offset = 38151232 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%75, %arg78, %76, %100, %arg151, %arg152) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%100, %19, %59) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%71, %100, %19) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x256x3x3xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%73, %arg88, %83, %100, %arg157, %arg158) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%100, %18, %61) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - %101 = "byre.alias"(%alloc) {offset = 1826816 : i64} : (memref<76022848xi8, "cuda">) -> memref<512x256x1x1xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%71, %100, %101) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> 
: tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda"> - %102 = "byre.alias"(%alloc) {offset = 59425856 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @PTXOp(%61, %59, %72, %102) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown83", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%70, %arg73, %102, %59, %arg148, %arg149) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - %103 = "byre.alias"(%alloc) {offset = 38151232 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%59, %17, %103) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - %104 = "byre.alias"(%alloc) {offset = 74843200 : i64} : (memref<76022848xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%68, %59, %104) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%69, %103, %59) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown87", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%67, %arg68, %59, %103, %arg145, %arg146) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%103, %16, %59) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%65, %103, %16) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = 
dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%102, %59, %66, %65) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown91", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%64, %arg58, %65, %59, %arg139, %arg140) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%59, %15, %103) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%62, %59, %17) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%63, %103, %59) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown95", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%60, %arg53, %59, %103, %arg136, %arg137) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%103, %14, %46) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%56, %103, %14) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x128x3x3xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%58, %arg63, %65, %103, %arg142, %arg143) {device = "cuda", epsilon = 9.99999974E-6 : f32, 
feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - %105 = "byre.alias"(%alloc) {offset = 46982208 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%103, %13, %105) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x128x1x1xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - %106 = "byre.alias"(%alloc) {offset = 59425856 : i64} : (memref<76022848xi8, "cuda">) -> memref<256x128x1x1xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%56, %103, %106) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x128x1x1xf16, "cuda"> - byre.compute @PTXOp(%105, %46, %57, %56) {BlockSize.x = 128 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown102", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%55, %arg48, %56, %46, %arg133, %arg134) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - %107 = "byre.alias"(%alloc) {offset = 38151232 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %56 = "byre.alias"(%alloc) <{offset = 4960256 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xi1, "cuda"> + byre.compute @PTXOp(%46, %50, %44, %56) {BlockSize.x = 256 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown39", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda"> + %57 = "byre.alias"(%alloc) <{offset = 11120384 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%44, %13, %57) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<256x128x1x1xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + %58 = "byre.alias"(%alloc) <{offset = 11521792 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute 
@BatchNormTrainingOp_f16f32f32_f16(%57, %arg63, %arg64, %58) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> + %59 = "byre.alias"(%alloc) <{offset = 46100224 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%44, %14, %59) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + %60 = "byre.alias"(%alloc) <{offset = 46501632 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%59, %arg53, %arg54, %60) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> + %61 = "byre.alias"(%alloc) <{offset = 2551808 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + %62 = "byre.alias"(%alloc) <{offset = 6704896 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xi1, "cuda"> + byre.compute @PTXOp(%60, %61, %62) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown46", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda"> + byre.compute @ConvOp_f16f16_f16(%61, %15, %60) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + %63 = "byre.alias"(%alloc) <{offset = 46903040 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%60, %arg58, %arg59, %63) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> + %64 = "byre.alias"(%alloc) <{offset = 4157440 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + %65 = "byre.alias"(%alloc) <{offset = 6905600 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xi1, "cuda"> + byre.compute @PTXOp(%63, %58, %64, %65) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown48", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda"> + %66 = "byre.alias"(%alloc) <{offset = 22736640 : i64}> : 
(memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%64, %16, %66) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%66, %arg68, %arg69, %63) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> + %67 = "byre.alias"(%alloc) <{offset = 71790336 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + %68 = "byre.alias"(%alloc) <{offset = 2056192 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xi1, "cuda"> + byre.compute @PTXOp(%63, %67, %68) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown46", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda"> + %69 = "byre.alias"(%alloc) <{offset = 9891584 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%67, %17, %69) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%69, %arg73, %arg74, %63) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> + %70 = "byre.alias"(%alloc) <{offset = 72191744 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + %71 = "byre.alias"(%alloc) <{offset = 294912 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xi1, "cuda"> + byre.compute @PTXOp(%63, %64, %70, %71) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown48", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda"> + %72 = "byre.alias"(%alloc) <{offset = 495616 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%70, %18, %72) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, 
memref<512x256x1x1xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + %73 = "byre.alias"(%alloc) <{offset = 11521792 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%72, %arg88, %arg89, %73) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> + %74 = "byre.alias"(%alloc) <{offset = 696320 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%70, %19, %74) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<512x256x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + %75 = "byre.alias"(%alloc) <{offset = 46903040 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%74, %arg78, %arg79, %75) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> + %76 = "byre.alias"(%alloc) <{offset = 897024 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + %77 = "byre.alias"(%alloc) <{offset = 6457088 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xi1, "cuda"> + byre.compute @PTXOp(%75, %76, %77) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown55", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda"> + byre.compute @ConvOp_f16f16_f16(%76, %20, %75) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + %78 = "byre.alias"(%alloc) <{offset = 47103744 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%75, %arg83, %arg84, %78) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> + %79 = "byre.alias"(%alloc) <{offset = 1097728 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + %80 = "byre.alias"(%alloc) <{offset = 4820992 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xi1, "cuda"> + byre.compute @PTXOp(%78, %73, %79, %80) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown57", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, 
memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda"> + %81 = "byre.alias"(%alloc) <{offset = 1298432 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%79, %21, %81) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%81, %arg93, %arg94, %78) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> + %82 = "byre.alias"(%alloc) <{offset = 1499136 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + %83 = "byre.alias"(%alloc) <{offset = 6356736 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xi1, "cuda"> + byre.compute @PTXOp(%78, %82, %83) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown55", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda"> + %84 = "byre.alias"(%alloc) <{offset = 72593152 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%82, %22, %84) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%84, %arg98, %arg99, %73) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> + %85 = "byre.alias"(%alloc) <{offset = 72793856 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xi1, "cuda"> + byre.compute @PTXOp(%73, %79, %78, %85) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown57", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda"> + %86 = "byre.alias"(%alloc) <{offset = 47103744 : i64}> : (memref<76533504xi8, "cuda">) -> memref<2048x49xf16, "cuda"> + %87 = "byre.alias"(%alloc) <{offset = 11521792 : i64}> : (memref<76533504xi8, "cuda">) -> memref<2048xf16, "cuda"> + byre.compute @PTXOp(%86, %87) {BlockSize.x = 64 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 2048 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown62_kernel"} : memref<2048x49xf16, "cuda">, memref<2048xf16, "cuda"> + %88 = "byre.alias"(%alloc) <{offset = 11521792 : i64}> : 
(memref<76533504xi8, "cuda">) -> memref<4x512xf16, "cuda"> + %89 = "byre.alias"(%alloc) <{offset = 5435520 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512xf16, "cuda"> + byre.compute @PTXOp(%88, %89) {BlockSize.x = 256 : i32, GridSize.x = 2 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown63", memory_effects = [1 : i32, 2 : i32]} : memref<4x512xf16, "cuda">, memref<4x512xf16, "cuda"> + %90 = "byre.alias"(%alloc) <{offset = 47103744 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x1000xf16, "cuda"> + byre.compute @MatmulOp_f16f16_f16(%89, %24, %90) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<4x512xf16, "cuda">, memref<1000x512xf16, "cuda">, memref<4x1000xf16, "cuda"> + %91 = "byre.alias"(%alloc) <{offset = 11521792 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x1000xf16, "cuda"> + byre.compute @PTXOp(%25, %90, %91) {BlockSize.x = 256 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown64", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1000xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf16, "cuda"> + %92 = "byre.alias"(%alloc) <{offset = 11529856 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4xf16, "cuda"> + byre.compute @PTXOp(%91, %92) {BlockSize.x = 512 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 4 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown65_kernel"} : memref<4x1000xf16, "cuda">, memref<4xf16, "cuda"> + byre.compute @PTXOp(%92, %91, %90) {BlockSize.x = 256 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown66", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf16, "cuda"> + %93 = "byre.alias"(%alloc) <{offset = 47111808 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4xf16, "cuda"> + byre.compute @PTXOp(%90, %93) {BlockSize.x = 512 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 4 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown67_kernel"} : memref<4x1000xf16, "cuda">, memref<4xf16, "cuda"> + %94 = "byre.alias"(%alloc) <{offset = 11521792 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4xf16, "cuda"> + byre.compute @PTXOp(%93, %94) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32], kernel_name = "Unknown68", memory_effects = [1 : i32, 2 : i32]} : memref<4xf16, "cuda">, memref<4xf16, "cuda"> + %95 = "byre.alias"(%alloc) <{offset = 5447680 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x1000xf16, "cuda"> + %96 = "byre.alias"(%alloc) <{offset = 5455744 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x1000xf16, "cuda"> + byre.compute @PTXOp(%94, %90, %26, %23, %95, %96) {BlockSize.x = 256 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 2 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown69", memory_effects = [1 : i32, 1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf16, "cuda"> + %97 = "byre.alias"(%alloc) <{offset = 47103744 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512xf16, "cuda"> + byre.compute @MatmulOp_f16f16_f16(%96, %24, %97) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 
: i64} : memref<4x1000xf16, "cuda">, memref<1000x512xf16, "cuda">, memref<4x512xf16, "cuda"> + %98 = "byre.alias"(%alloc) <{offset = 72969984 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @PTXOp(%97, %85, %98) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [2 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown70", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x512xf16, "cuda">, memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%84, %arg98, %98, %78, %arg163, %arg164) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%78, %22, %73) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%82, %78, %22) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%83, %73, %78) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown74", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%81, %arg93, %78, %73, %arg160, %arg161) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%73, %21, %78) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%79, %73, %21) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%98, %78, %80, %84) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown78", memory_effects = [1 : i32, 1 : i32, 1 : 
i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%75, %arg83, %84, %73, %arg154, %arg155) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%73, %20, %75) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%76, %73, %20) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%77, %75, %73) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown74", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%74, %arg78, %73, %75, %arg151, %arg152) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%75, %19, %58) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%70, %75, %19) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x256x3x3xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%72, %arg88, %84, %98, %arg157, %arg158) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%98, %18, %63) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", 
kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + %99 = "byre.alias"(%alloc) <{offset = 1499136 : i64}> : (memref<76533504xi8, "cuda">) -> memref<512x256x1x1xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%70, %98, %99) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda"> + %100 = "byre.alias"(%alloc) <{offset = 4558848 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @PTXOp(%63, %58, %71, %100) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown89", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%69, %arg73, %100, %63, %arg148, %arg149) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%63, %17, %58) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + %101 = "byre.alias"(%alloc) <{offset = 72969984 : i64}> : (memref<76533504xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%67, %63, %101) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%68, %58, %63) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown93", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%66, %arg68, %63, %58, %arg145, %arg146) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%58, %16, %63) {batch_group_count = 1 : i64, device = "cuda", 
feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + %102 = "byre.alias"(%alloc) <{offset = 71790336 : i64}> : (memref<76533504xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%64, %58, %102) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> + %103 = "byre.alias"(%alloc) <{offset = 21556992 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @PTXOp(%100, %63, %65, %103) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown89", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda"> + %104 = "byre.alias"(%alloc) <{offset = 8711936 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%60, %arg58, %103, %104, %arg139, %arg140) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%104, %15, %58) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%61, %104, %15) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%62, %58, %60) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown93", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%59, %arg53, %60, %58, %arg136, %arg137) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%58, 
%14, %46) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%44, %58, %14) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x128x3x3xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%57, %arg63, %103, %104, %arg142, %arg143) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + %105 = "byre.alias"(%alloc) <{offset = 46903040 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%104, %13, %105) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x128x1x1xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + %106 = "byre.alias"(%alloc) <{offset = 4558848 : i64}> : (memref<76533504xi8, "cuda">) -> memref<256x128x1x1xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%44, %104, %106) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x128x1x1xf16, "cuda"> + byre.compute @PTXOp(%105, %46, %56, %44) {BlockSize.x = 256 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown108", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%55, %arg48, %44, %46, %arg133, %arg134) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + %107 = "byre.alias"(%alloc) <{offset = 11120384 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%46, %12, %107) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, 
window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - %108 = "byre.alias"(%alloc) {offset = 68846656 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + %108 = "byre.alias"(%alloc) <{offset = 56536832 : i64}> : (memref<76533504xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%53, %46, %108) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> - %109 = "byre.alias"(%alloc) {offset = 73663552 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> - byre.compute @PTXOp(%54, %107, %109) {BlockSize.x = 128 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown106", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%52, %arg43, %109, %46, %arg130, %arg131) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%46, %11, %107) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - %110 = "byre.alias"(%alloc) {offset = 69141568 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%50, %46, %110) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> - byre.compute @PTXOp(%56, %107, %51, %109) {BlockSize.x = 128 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown110", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%49, %arg33, %109, %46, %arg124, %arg125) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%46, %10, %107) {batch_group_count = 1 : i64, device = "cuda", 
feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - %111 = "byre.alias"(%alloc) {offset = 72860736 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%54, %107, %46) {BlockSize.x = 256 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown112", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%52, %arg43, %46, %107, %arg130, %arg131) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%107, %11, %46) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + %109 = "byre.alias"(%alloc) <{offset = 61353728 : i64}> : (memref<76533504xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%50, %107, %109) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> + %110 = "byre.alias"(%alloc) <{offset = 8711936 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%44, %46, %51, %110) {BlockSize.x = 256 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown108", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%49, %arg33, %110, %46, %arg124, %arg125) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%46, %10, %44) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + %111 = "byre.alias"(%alloc) <{offset = 55734016 : 
i64}> : (memref<76533504xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%47, %46, %111) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> - byre.compute @PTXOp(%48, %107, %46) {BlockSize.x = 128 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown114", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%45, %arg28, %46, %107, %arg121, %arg122) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%107, %9, %37) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %112 = "byre.alias"(%alloc) {offset = 73155648 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x64x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%41, %107, %112) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x64x3x3xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%43, %arg38, %109, %107, %arg127, %arg128) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - %113 = "byre.alias"(%alloc) {offset = 47785024 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%107, %8, %113) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x64x1x1xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %114 = "byre.alias"(%alloc) {offset = 73663552 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x64x1x1xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%41, %107, %114) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], 
output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x64x1x1xf16, "cuda"> - byre.compute @PTXOp(%113, %37, %42, %41) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown121", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda"> - %115 = "byre.alias"(%alloc) {offset = 38151232 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%40, %arg23, %41, %115, %arg118, %arg119) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%115, %7, %37) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %116 = "byre.alias"(%alloc) {offset = 9217088 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%38, %115, %116) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%39, %37, %115) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown125", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%36, %arg18, %115, %37, %arg115, %arg116) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%37, %6, %115) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %117 = "byre.alias"(%alloc) {offset = 12428352 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%30, %37, %117) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 
: i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%41, %115, %35, %36) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown129", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%33, %arg13, %36, %30, %arg112, %arg113) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%30, %5, %115) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %118 = "byre.alias"(%alloc) {offset = 15639616 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%31, %30, %118) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%32, %115, %30) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown133", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%29, %arg8, %30, %115, %arg109, %arg110) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%115, %4, %30) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %119 = "byre.alias"(%alloc) {offset = 20030528 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%28, %115, %119) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : 
tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%36, %30, %115) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown137", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - byre.compute @PoolMaxGradOp_f16f16_f16(%26, %115, %3) {device = "cuda", memory_effects = [1 : i32, 1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x112x112xf16, "cuda"> - %120 = "byre.alias"(%alloc) {offset = 38151232 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x112x112xf16, "cuda"> - byre.compute @PTXOp(%27, %3, %120) {BlockSize.x = 128 : i32, GridSize.x = 25088 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown138", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x64x112x112xi1, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%2, %arg3, %120, %26, %arg106, %arg107) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - %121 = "byre.alias"(%alloc) {offset = 50996288 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x3x7x7xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%0, %26, %121) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x3x224x224xf16, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<64x3x7x7xf16, "cuda"> - %122 = "byre.alias"(%alloc) {offset = 62424128 : i64} : (memref<76022848xi8, "cuda">) -> memref - byre.compute @ReduceSumOp_f32_f32(%95, %122) {device = "cuda", dimensions = dense<[0, 1]> : tensor<2xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf32, "cuda">, memref - byre.compute @PTXOp(%122, %arg104) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [0 : i32, 0 : i32], kernel_name = "Unknown141", memory_effects = [1 : i32, 2 : i32]} : memref, memref - byre.compute @PTXOp(%121, %arg105) {BlockSize.x = 128 : i32, GridSize.x = 74 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown142", memory_effects = [1 : i32, 2 : i32]} : memref<64x3x7x7xf16, "cuda">, memref<64x3x7x7xf32, "cuda"> - byre.compute @PTXOp(%119, %arg108) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown143", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%118, %arg111) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown144", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%117, %arg114) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], 
kernel_name = "Unknown145", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%116, %arg117) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown146", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%112, %arg120) {BlockSize.x = 128 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown147", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x3x3xf16, "cuda">, memref<128x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%111, %arg123) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown148", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> - byre.compute @PTXOp(%114, %arg126) {BlockSize.x = 128 : i32, GridSize.x = 64 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown149", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x1x1xf16, "cuda">, memref<128x64x1x1xf32, "cuda"> - byre.compute @PTXOp(%110, %arg129) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown150", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> - byre.compute @PTXOp(%108, %arg132) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown151", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> - byre.compute @PTXOp(%14, %arg135) {BlockSize.x = 128 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown152", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x3x3xf16, "cuda">, memref<256x128x3x3xf32, "cuda"> - byre.compute @PTXOp(%17, %arg138) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown153", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> - byre.compute @PTXOp(%106, %arg141) {BlockSize.x = 128 : i32, GridSize.x = 256 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown154", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x1x1xf16, "cuda">, memref<256x128x1x1xf32, "cuda"> - byre.compute @PTXOp(%16, %arg144) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown155", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> - byre.compute @PTXOp(%104, %arg147) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown156", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> - byre.compute @PTXOp(%19, %arg150) {BlockSize.x = 128 : i32, GridSize.x = 9216 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown157", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x3x3xf16, "cuda">, memref<512x256x3x3xf32, "cuda"> - byre.compute @PTXOp(%21, %arg153) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown158", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> - byre.compute @PTXOp(%101, %arg156) {BlockSize.x = 128 : i32, GridSize.x = 1024 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown159", memory_effects = [1 
: i32, 2 : i32]} : memref<512x256x1x1xf16, "cuda">, memref<512x256x1x1xf32, "cuda"> - byre.compute @PTXOp(%99, %arg159) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown160", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> - byre.compute @PTXOp(%22, %arg162) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown161", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> - %123 = "byre.alias"(%alloc) {offset = 62424128 : i64} : (memref<76022848xi8, "cuda">) -> memref<1000x512xf16, "cuda"> - byre.compute @MatmulOp_f16f16_f16(%88, %94, %123) {device = "cuda", lhs_contracting_dimension = 0 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_transpose, rhs_contracting_dimension = 0 : i64} : memref<4x512xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<1000x512xf16, "cuda"> - byre.compute @PTXOp(%123, %arg165) {BlockSize.x = 128 : i32, GridSize.x = 4000 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown163", memory_effects = [1 : i32, 2 : i32]} : memref<1000x512xf16, "cuda">, memref<1000x512xf32, "cuda"> - %124 = "byre.alias"(%alloc) {offset = 62424128 : i64} : (memref<76022848xi8, "cuda">) -> memref<1000xf32, "cuda"> - byre.compute @ReduceSumOp_f32_f32(%96, %124) {device = "cuda", dimensions = dense<0> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf32, "cuda">, memref<1000xf32, "cuda"> - byre.compute @PTXOp(%124, %arg166) {BlockSize.x = 128 : i32, GridSize.x = 8 : i32, arg_ranks = [1 : i32, 1 : i32], kernel_name = "Unknown164", memory_effects = [1 : i32, 2 : i32]} : memref<1000xf32, "cuda">, memref<1000xf32, "cuda"> + byre.compute @PTXOp(%48, %44, %46) {BlockSize.x = 256 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown112", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + %112 = "byre.alias"(%alloc) <{offset = 9514752 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%45, %arg28, %46, %112, %arg121, %arg122) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%112, %9, %41) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + %113 = "byre.alias"(%alloc) <{offset = 56028928 : i64}> : (memref<76533504xi8, "cuda">) -> memref<128x64x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%37, %112, %113) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : 
memref<4x64x56x56xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x64x3x3xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%43, %arg38, %110, %46, %arg127, %arg128) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + %114 = "byre.alias"(%alloc) <{offset = 8711936 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%46, %8, %114) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x64x1x1xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + %115 = "byre.alias"(%alloc) <{offset = 62156544 : i64}> : (memref<76533504xi8, "cuda">) -> memref<128x64x1x1xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%37, %46, %115) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x64x1x1xf16, "cuda"> + %116 = "byre.alias"(%alloc) <{offset = 21556992 : i64}> : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @PTXOp(%114, %41, %42, %116) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown127", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%40, %arg23, %116, %37, %arg118, %arg119) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%37, %7, %40) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + %117 = "byre.alias"(%alloc) <{offset = 10317568 : i64}> : (memref<76533504xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%38, %37, %117) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%39, %40, %37) {BlockSize.x 
= 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown131", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%31, %arg18, %37, %38, %arg115, %arg116) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%38, %6, %31) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + %118 = "byre.alias"(%alloc) <{offset = 11923200 : i64}> : (memref<76533504xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%35, %38, %118) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%116, %31, %36, %35) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown127", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%34, %arg13, %35, %31, %arg112, %arg113) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%31, %5, %34) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + %119 = "byre.alias"(%alloc) <{offset = 13528832 : i64}> : (memref<76533504xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%32, %31, %119) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%33, %34, %31) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown131", 
memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%30, %arg8, %31, %34, %arg109, %arg110) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%34, %4, %31) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + %120 = "byre.alias"(%alloc) <{offset = 19951360 : i64}> : (memref<76533504xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%29, %34, %120) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%35, %31, %34) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown143", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + byre.compute @PoolMaxGradOp_f16f16_f16(%27, %34, %3) {device = "cuda", memory_effects = [1 : i32, 1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x112x112xf16, "cuda"> + byre.compute @PTXOp(%28, %3, %27) {BlockSize.x = 256 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], kernel_name = "Unknown144", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x64x112x112xi1, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%2, %arg3, %27, %3, %arg106, %arg107) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%0, %3, %1) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x3x224x224xf16, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<64x3x7x7xf16, "cuda"> + %121 = "byre.alias"(%alloc) <{offset = 62978176 : i64}> : (memref<76533504xi8, "cuda">) -> memref<f32, "cuda"> + %122 = "byre.alias"(%alloc) <{offset = 5447680 : i64}> :
(memref<76533504xi8, "cuda">) -> memref<32x125xf16, "cuda"> + %123 = "byre.alias"(%arg1) <{offset = 0 : i64}> : (memref<4x1000xf32, "cuda">) -> memref<32x125xf32, "cuda"> + %124 = "byre.alias"(%alloc) <{offset = 49311488 : i64}> : (memref<76533504xi8, "cuda">) -> memref<32xf32, "cuda"> + byre.compute @PTXOp(%122, %123, %124) {BlockSize.x = 128 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 32 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown147_kernel"} : memref<32x125xf16, "cuda">, memref<32x125xf32, "cuda">, memref<32xf32, "cuda"> + byre.compute @PTXOp(%124, %121) {BlockSize.x = 32 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 1 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown147_kernel_0"} : memref<32xf32, "cuda">, memref<f32, "cuda"> + byre.compute @PTXOp(%121, %arg104) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [0 : i32, 0 : i32], kernel_name = "Unknown148", memory_effects = [1 : i32, 2 : i32]} : memref<f32, "cuda">, memref<f32, "cuda"> + byre.compute @PTXOp(%1, %arg105) {BlockSize.x = 256 : i32, GridSize.x = 10 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown149", memory_effects = [1 : i32, 2 : i32]} : memref<64x3x7x7xf16, "cuda">, memref<64x3x7x7xf32, "cuda"> + byre.compute @PTXOp(%120, %arg108) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown150", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> + byre.compute @PTXOp(%119, %arg111) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown150", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> + byre.compute @PTXOp(%118, %arg114) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown150", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> + byre.compute @PTXOp(%117, %arg117) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown150", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> + byre.compute @PTXOp(%113, %arg120) {BlockSize.x = 256 : i32, GridSize.x = 72 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown154", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x3x3xf16, "cuda">, memref<128x64x3x3xf32, "cuda"> + byre.compute @PTXOp(%111, %arg123) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown155", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> + byre.compute @PTXOp(%115, %arg126) {BlockSize.x = 256 : i32, GridSize.x = 8 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown156", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x1x1xf16, "cuda">, memref<128x64x1x1xf32, "cuda"> + byre.compute @PTXOp(%109, %arg129) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown155", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> + byre.compute @PTXOp(%108, %arg132) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown155", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> + byre.compute
@PTXOp(%14, %arg135) {BlockSize.x = 256 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown159", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x3x3xf16, "cuda">, memref<256x128x3x3xf32, "cuda"> + byre.compute @PTXOp(%15, %arg138) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown160", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> + byre.compute @PTXOp(%106, %arg141) {BlockSize.x = 256 : i32, GridSize.x = 32 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown161", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x1x1xf16, "cuda">, memref<256x128x1x1xf32, "cuda"> + byre.compute @PTXOp(%102, %arg144) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown160", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> + byre.compute @PTXOp(%101, %arg147) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown160", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> + byre.compute @PTXOp(%19, %arg150) {BlockSize.x = 256 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown164", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x3x3xf16, "cuda">, memref<512x256x3x3xf32, "cuda"> + byre.compute @PTXOp(%20, %arg153) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown165", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> + byre.compute @PTXOp(%99, %arg156) {BlockSize.x = 256 : i32, GridSize.x = 128 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown166", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x1x1xf16, "cuda">, memref<512x256x1x1xf32, "cuda"> + byre.compute @PTXOp(%21, %arg159) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown165", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> + byre.compute @PTXOp(%22, %arg162) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], kernel_name = "Unknown165", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> + %125 = "byre.alias"(%alloc) <{offset = 62959360 : i64}> : (memref<76533504xi8, "cuda">) -> memref<1000x512xf16, "cuda"> + byre.compute @MatmulOp_f16f16_f16(%89, %96, %125) {device = "cuda", lhs_contracting_dimension = 0 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_transpose, rhs_contracting_dimension = 0 : i64} : memref<4x512xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<1000x512xf16, "cuda"> + byre.compute @PTXOp(%125, %arg165) {BlockSize.x = 256 : i32, GridSize.x = 500 : i32, arg_ranks = [2 : i32, 2 : i32], kernel_name = "Unknown170", memory_effects = [1 : i32, 2 : i32]} : memref<1000x512xf16, "cuda">, memref<1000x512xf32, "cuda"> + %126 = "byre.alias"(%alloc) <{offset = 62959360 : i64}> : (memref<76533504xi8, "cuda">) -> memref<1000xf32, "cuda"> + byre.compute @PTXOp(%96, %126) {BlockSize.x = 32 : i32, BlockSize.y = 2 : i32, BlockSize.z = 1 : i32, GridSize.x = 32 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown171_kernel"} : memref<4x1000xf16, "cuda">, memref<1000xf32, "cuda"> + 
byre.compute @PTXOp(%126, %arg166) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32], kernel_name = "Unknown172", memory_effects = [1 : i32, 2 : i32]} : memref<1000xf32, "cuda">, memref<1000xf32, "cuda"> return } -} \ No newline at end of file +} + diff --git a/compiler/test/E2E/ResNet18/Whole/host_output.mlir b/compiler/test/E2E/ResNet18/Whole/host_output.mlir index db4102fa0..0845dd8cc 100644 --- a/compiler/test/E2E/ResNet18/Whole/host_output.mlir +++ b/compiler/test/E2E/ResNet18/Whole/host_output.mlir @@ -4,327 +4,332 @@ module @IrToMhlo.2452 attributes {byre.container_module, gpu.container_module} { func.func @main(%arg0: memref<4x3x224x224xf32, "cuda"> {byre.argname = "Input0", byre.argtype = 1 : i32}, %arg1: memref<4x1000xf32, "cuda"> {byre.argname = "Input1", byre.argtype = 1 : i32}, %arg2: memref<64x3x7x7xf32, "cuda"> {byre.argname = "Input2", byre.argtype = 1 : i32}, %arg3: memref<64xf32, "cuda"> {byre.argname = "Input3", byre.argtype = 1 : i32}, %arg4: memref<64xf32, "cuda"> {byre.argname = "Input4", byre.argtype = 1 : i32}, %arg5: memref<64xf32, "cuda"> {byre.argname = "Input5", byre.argtype = 1 : i32}, %arg6: memref<64xf32, "cuda"> {byre.argname = "Input6", byre.argtype = 1 : i32}, %arg7: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Input7", byre.argtype = 1 : i32}, %arg8: memref<64xf32, "cuda"> {byre.argname = "Input8", byre.argtype = 1 : i32}, %arg9: memref<64xf32, "cuda"> {byre.argname = "Input9", byre.argtype = 1 : i32}, %arg10: memref<64xf32, "cuda"> {byre.argname = "Input10", byre.argtype = 1 : i32}, %arg11: memref<64xf32, "cuda"> {byre.argname = "Input11", byre.argtype = 1 : i32}, %arg12: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Input12", byre.argtype = 1 : i32}, %arg13: memref<64xf32, "cuda"> {byre.argname = "Input13", byre.argtype = 1 : i32}, %arg14: memref<64xf32, "cuda"> {byre.argname = "Input14", byre.argtype = 1 : i32}, %arg15: memref<64xf32, "cuda"> {byre.argname = "Input15", byre.argtype = 1 : i32}, %arg16: memref<64xf32, "cuda"> {byre.argname = "Input16", byre.argtype = 1 : i32}, %arg17: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Input17", byre.argtype = 1 : i32}, %arg18: memref<64xf32, "cuda"> {byre.argname = "Input18", byre.argtype = 1 : i32}, %arg19: memref<64xf32, "cuda"> {byre.argname = "Input19", byre.argtype = 1 : i32}, %arg20: memref<64xf32, "cuda"> {byre.argname = "Input20", byre.argtype = 1 : i32}, %arg21: memref<64xf32, "cuda"> {byre.argname = "Input21", byre.argtype = 1 : i32}, %arg22: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Input22", byre.argtype = 1 : i32}, %arg23: memref<64xf32, "cuda"> {byre.argname = "Input23", byre.argtype = 1 : i32}, %arg24: memref<64xf32, "cuda"> {byre.argname = "Input24", byre.argtype = 1 : i32}, %arg25: memref<64xf32, "cuda"> {byre.argname = "Input25", byre.argtype = 1 : i32}, %arg26: memref<64xf32, "cuda"> {byre.argname = "Input26", byre.argtype = 1 : i32}, %arg27: memref<128x64x3x3xf32, "cuda"> {byre.argname = "Input27", byre.argtype = 1 : i32}, %arg28: memref<128xf32, "cuda"> {byre.argname = "Input28", byre.argtype = 1 : i32}, %arg29: memref<128xf32, "cuda"> {byre.argname = "Input29", byre.argtype = 1 : i32}, %arg30: memref<128xf32, "cuda"> {byre.argname = "Input30", byre.argtype = 1 : i32}, %arg31: memref<128xf32, "cuda"> {byre.argname = "Input31", byre.argtype = 1 : i32}, %arg32: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Input32", byre.argtype = 1 : i32}, %arg33: memref<128xf32, "cuda"> {byre.argname = "Input33", byre.argtype = 1 : i32}, %arg34: memref<128xf32, 
"cuda"> {byre.argname = "Input34", byre.argtype = 1 : i32}, %arg35: memref<128xf32, "cuda"> {byre.argname = "Input35", byre.argtype = 1 : i32}, %arg36: memref<128xf32, "cuda"> {byre.argname = "Input36", byre.argtype = 1 : i32}, %arg37: memref<128x64x1x1xf32, "cuda"> {byre.argname = "Input37", byre.argtype = 1 : i32}, %arg38: memref<128xf32, "cuda"> {byre.argname = "Input38", byre.argtype = 1 : i32}, %arg39: memref<128xf32, "cuda"> {byre.argname = "Input39", byre.argtype = 1 : i32}, %arg40: memref<128xf32, "cuda"> {byre.argname = "Input40", byre.argtype = 1 : i32}, %arg41: memref<128xf32, "cuda"> {byre.argname = "Input41", byre.argtype = 1 : i32}, %arg42: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Input42", byre.argtype = 1 : i32}, %arg43: memref<128xf32, "cuda"> {byre.argname = "Input43", byre.argtype = 1 : i32}, %arg44: memref<128xf32, "cuda"> {byre.argname = "Input44", byre.argtype = 1 : i32}, %arg45: memref<128xf32, "cuda"> {byre.argname = "Input45", byre.argtype = 1 : i32}, %arg46: memref<128xf32, "cuda"> {byre.argname = "Input46", byre.argtype = 1 : i32}, %arg47: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Input47", byre.argtype = 1 : i32}, %arg48: memref<128xf32, "cuda"> {byre.argname = "Input48", byre.argtype = 1 : i32}, %arg49: memref<128xf32, "cuda"> {byre.argname = "Input49", byre.argtype = 1 : i32}, %arg50: memref<128xf32, "cuda"> {byre.argname = "Input50", byre.argtype = 1 : i32}, %arg51: memref<128xf32, "cuda"> {byre.argname = "Input51", byre.argtype = 1 : i32}, %arg52: memref<256x128x3x3xf32, "cuda"> {byre.argname = "Input52", byre.argtype = 1 : i32}, %arg53: memref<256xf32, "cuda"> {byre.argname = "Input53", byre.argtype = 1 : i32}, %arg54: memref<256xf32, "cuda"> {byre.argname = "Input54", byre.argtype = 1 : i32}, %arg55: memref<256xf32, "cuda"> {byre.argname = "Input55", byre.argtype = 1 : i32}, %arg56: memref<256xf32, "cuda"> {byre.argname = "Input56", byre.argtype = 1 : i32}, %arg57: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Input57", byre.argtype = 1 : i32}, %arg58: memref<256xf32, "cuda"> {byre.argname = "Input58", byre.argtype = 1 : i32}, %arg59: memref<256xf32, "cuda"> {byre.argname = "Input59", byre.argtype = 1 : i32}, %arg60: memref<256xf32, "cuda"> {byre.argname = "Input60", byre.argtype = 1 : i32}, %arg61: memref<256xf32, "cuda"> {byre.argname = "Input61", byre.argtype = 1 : i32}, %arg62: memref<256x128x1x1xf32, "cuda"> {byre.argname = "Input62", byre.argtype = 1 : i32}, %arg63: memref<256xf32, "cuda"> {byre.argname = "Input63", byre.argtype = 1 : i32}, %arg64: memref<256xf32, "cuda"> {byre.argname = "Input64", byre.argtype = 1 : i32}, %arg65: memref<256xf32, "cuda"> {byre.argname = "Input65", byre.argtype = 1 : i32}, %arg66: memref<256xf32, "cuda"> {byre.argname = "Input66", byre.argtype = 1 : i32}, %arg67: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Input67", byre.argtype = 1 : i32}, %arg68: memref<256xf32, "cuda"> {byre.argname = "Input68", byre.argtype = 1 : i32}, %arg69: memref<256xf32, "cuda"> {byre.argname = "Input69", byre.argtype = 1 : i32}, %arg70: memref<256xf32, "cuda"> {byre.argname = "Input70", byre.argtype = 1 : i32}, %arg71: memref<256xf32, "cuda"> {byre.argname = "Input71", byre.argtype = 1 : i32}, %arg72: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Input72", byre.argtype = 1 : i32}, %arg73: memref<256xf32, "cuda"> {byre.argname = "Input73", byre.argtype = 1 : i32}, %arg74: memref<256xf32, "cuda"> {byre.argname = "Input74", byre.argtype = 1 : i32}, %arg75: memref<256xf32, "cuda"> {byre.argname = "Input75", byre.argtype 
= 1 : i32}, %arg76: memref<256xf32, "cuda"> {byre.argname = "Input76", byre.argtype = 1 : i32}, %arg77: memref<512x256x3x3xf32, "cuda"> {byre.argname = "Input77", byre.argtype = 1 : i32}, %arg78: memref<512xf32, "cuda"> {byre.argname = "Input78", byre.argtype = 1 : i32}, %arg79: memref<512xf32, "cuda"> {byre.argname = "Input79", byre.argtype = 1 : i32}, %arg80: memref<512xf32, "cuda"> {byre.argname = "Input80", byre.argtype = 1 : i32}, %arg81: memref<512xf32, "cuda"> {byre.argname = "Input81", byre.argtype = 1 : i32}, %arg82: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Input82", byre.argtype = 1 : i32}, %arg83: memref<512xf32, "cuda"> {byre.argname = "Input83", byre.argtype = 1 : i32}, %arg84: memref<512xf32, "cuda"> {byre.argname = "Input84", byre.argtype = 1 : i32}, %arg85: memref<512xf32, "cuda"> {byre.argname = "Input85", byre.argtype = 1 : i32}, %arg86: memref<512xf32, "cuda"> {byre.argname = "Input86", byre.argtype = 1 : i32}, %arg87: memref<512x256x1x1xf32, "cuda"> {byre.argname = "Input87", byre.argtype = 1 : i32}, %arg88: memref<512xf32, "cuda"> {byre.argname = "Input88", byre.argtype = 1 : i32}, %arg89: memref<512xf32, "cuda"> {byre.argname = "Input89", byre.argtype = 1 : i32}, %arg90: memref<512xf32, "cuda"> {byre.argname = "Input90", byre.argtype = 1 : i32}, %arg91: memref<512xf32, "cuda"> {byre.argname = "Input91", byre.argtype = 1 : i32}, %arg92: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Input92", byre.argtype = 1 : i32}, %arg93: memref<512xf32, "cuda"> {byre.argname = "Input93", byre.argtype = 1 : i32}, %arg94: memref<512xf32, "cuda"> {byre.argname = "Input94", byre.argtype = 1 : i32}, %arg95: memref<512xf32, "cuda"> {byre.argname = "Input95", byre.argtype = 1 : i32}, %arg96: memref<512xf32, "cuda"> {byre.argname = "Input96", byre.argtype = 1 : i32}, %arg97: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Input97", byre.argtype = 1 : i32}, %arg98: memref<512xf32, "cuda"> {byre.argname = "Input98", byre.argtype = 1 : i32}, %arg99: memref<512xf32, "cuda"> {byre.argname = "Input99", byre.argtype = 1 : i32}, %arg100: memref<512xf32, "cuda"> {byre.argname = "Input100", byre.argtype = 1 : i32}, %arg101: memref<512xf32, "cuda"> {byre.argname = "Input101", byre.argtype = 1 : i32}, %arg102: memref<1000x512xf32, "cuda"> {byre.argname = "Input102", byre.argtype = 1 : i32}, %arg103: memref<1000xf32, "cuda"> {byre.argname = "Input103", byre.argtype = 1 : i32}, %arg104: memref {byre.argname = "Output0", byre.argtype = 2 : i32}, %arg105: memref<64x3x7x7xf32, "cuda"> {byre.argname = "Output1", byre.argtype = 2 : i32}, %arg106: memref<64xf32, "cuda"> {byre.argname = "Output2", byre.argtype = 2 : i32}, %arg107: memref<64xf32, "cuda"> {byre.argname = "Output3", byre.argtype = 2 : i32}, %arg108: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Output4", byre.argtype = 2 : i32}, %arg109: memref<64xf32, "cuda"> {byre.argname = "Output5", byre.argtype = 2 : i32}, %arg110: memref<64xf32, "cuda"> {byre.argname = "Output6", byre.argtype = 2 : i32}, %arg111: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Output7", byre.argtype = 2 : i32}, %arg112: memref<64xf32, "cuda"> {byre.argname = "Output8", byre.argtype = 2 : i32}, %arg113: memref<64xf32, "cuda"> {byre.argname = "Output9", byre.argtype = 2 : i32}, %arg114: memref<64x64x3x3xf32, "cuda"> {byre.argname = "Output10", byre.argtype = 2 : i32}, %arg115: memref<64xf32, "cuda"> {byre.argname = "Output11", byre.argtype = 2 : i32}, %arg116: memref<64xf32, "cuda"> {byre.argname = "Output12", byre.argtype = 2 : i32}, %arg117: memref<64x64x3x3xf32, 
"cuda"> {byre.argname = "Output13", byre.argtype = 2 : i32}, %arg118: memref<64xf32, "cuda"> {byre.argname = "Output14", byre.argtype = 2 : i32}, %arg119: memref<64xf32, "cuda"> {byre.argname = "Output15", byre.argtype = 2 : i32}, %arg120: memref<128x64x3x3xf32, "cuda"> {byre.argname = "Output16", byre.argtype = 2 : i32}, %arg121: memref<128xf32, "cuda"> {byre.argname = "Output17", byre.argtype = 2 : i32}, %arg122: memref<128xf32, "cuda"> {byre.argname = "Output18", byre.argtype = 2 : i32}, %arg123: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Output19", byre.argtype = 2 : i32}, %arg124: memref<128xf32, "cuda"> {byre.argname = "Output20", byre.argtype = 2 : i32}, %arg125: memref<128xf32, "cuda"> {byre.argname = "Output21", byre.argtype = 2 : i32}, %arg126: memref<128x64x1x1xf32, "cuda"> {byre.argname = "Output22", byre.argtype = 2 : i32}, %arg127: memref<128xf32, "cuda"> {byre.argname = "Output23", byre.argtype = 2 : i32}, %arg128: memref<128xf32, "cuda"> {byre.argname = "Output24", byre.argtype = 2 : i32}, %arg129: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Output25", byre.argtype = 2 : i32}, %arg130: memref<128xf32, "cuda"> {byre.argname = "Output26", byre.argtype = 2 : i32}, %arg131: memref<128xf32, "cuda"> {byre.argname = "Output27", byre.argtype = 2 : i32}, %arg132: memref<128x128x3x3xf32, "cuda"> {byre.argname = "Output28", byre.argtype = 2 : i32}, %arg133: memref<128xf32, "cuda"> {byre.argname = "Output29", byre.argtype = 2 : i32}, %arg134: memref<128xf32, "cuda"> {byre.argname = "Output30", byre.argtype = 2 : i32}, %arg135: memref<256x128x3x3xf32, "cuda"> {byre.argname = "Output31", byre.argtype = 2 : i32}, %arg136: memref<256xf32, "cuda"> {byre.argname = "Output32", byre.argtype = 2 : i32}, %arg137: memref<256xf32, "cuda"> {byre.argname = "Output33", byre.argtype = 2 : i32}, %arg138: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Output34", byre.argtype = 2 : i32}, %arg139: memref<256xf32, "cuda"> {byre.argname = "Output35", byre.argtype = 2 : i32}, %arg140: memref<256xf32, "cuda"> {byre.argname = "Output36", byre.argtype = 2 : i32}, %arg141: memref<256x128x1x1xf32, "cuda"> {byre.argname = "Output37", byre.argtype = 2 : i32}, %arg142: memref<256xf32, "cuda"> {byre.argname = "Output38", byre.argtype = 2 : i32}, %arg143: memref<256xf32, "cuda"> {byre.argname = "Output39", byre.argtype = 2 : i32}, %arg144: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Output40", byre.argtype = 2 : i32}, %arg145: memref<256xf32, "cuda"> {byre.argname = "Output41", byre.argtype = 2 : i32}, %arg146: memref<256xf32, "cuda"> {byre.argname = "Output42", byre.argtype = 2 : i32}, %arg147: memref<256x256x3x3xf32, "cuda"> {byre.argname = "Output43", byre.argtype = 2 : i32}, %arg148: memref<256xf32, "cuda"> {byre.argname = "Output44", byre.argtype = 2 : i32}, %arg149: memref<256xf32, "cuda"> {byre.argname = "Output45", byre.argtype = 2 : i32}, %arg150: memref<512x256x3x3xf32, "cuda"> {byre.argname = "Output46", byre.argtype = 2 : i32}, %arg151: memref<512xf32, "cuda"> {byre.argname = "Output47", byre.argtype = 2 : i32}, %arg152: memref<512xf32, "cuda"> {byre.argname = "Output48", byre.argtype = 2 : i32}, %arg153: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Output49", byre.argtype = 2 : i32}, %arg154: memref<512xf32, "cuda"> {byre.argname = "Output50", byre.argtype = 2 : i32}, %arg155: memref<512xf32, "cuda"> {byre.argname = "Output51", byre.argtype = 2 : i32}, %arg156: memref<512x256x1x1xf32, "cuda"> {byre.argname = "Output52", byre.argtype = 2 : i32}, %arg157: memref<512xf32, "cuda"> 
{byre.argname = "Output53", byre.argtype = 2 : i32}, %arg158: memref<512xf32, "cuda"> {byre.argname = "Output54", byre.argtype = 2 : i32}, %arg159: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Output55", byre.argtype = 2 : i32}, %arg160: memref<512xf32, "cuda"> {byre.argname = "Output56", byre.argtype = 2 : i32}, %arg161: memref<512xf32, "cuda"> {byre.argname = "Output57", byre.argtype = 2 : i32}, %arg162: memref<512x512x3x3xf32, "cuda"> {byre.argname = "Output58", byre.argtype = 2 : i32}, %arg163: memref<512xf32, "cuda"> {byre.argname = "Output59", byre.argtype = 2 : i32}, %arg164: memref<512xf32, "cuda"> {byre.argname = "Output60", byre.argtype = 2 : i32}, %arg165: memref<1000x512xf32, "cuda"> {byre.argname = "Output61", byre.argtype = 2 : i32}, %arg166: memref<1000xf32, "cuda"> {byre.argname = "Output62", byre.argtype = 2 : i32}) attributes {byre.entry_point, device_file_name = "your_file"} { - %alloc = memref.alloc() : memref<76022848xi8, "cuda"> - %0 = "byre.alias"(%alloc) {device = "cuda", offset = 8012864 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x3x224x224xf16, "cuda"> - byre.compute @PTXOp(%arg0, %0) {BlockSize.x = 128 : i32, GridSize.x = 4704 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown0", memory_effects = [1 : i32, 2 : i32]} : memref<4x3x224x224xf32, "cuda">, memref<4x3x224x224xf16, "cuda"> - %1 = "byre.alias"(%alloc) {device = "cuda", offset = 62424128 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x3x7x7xf16, "cuda"> - byre.compute @PTXOp(%arg2, %1) {BlockSize.x = 128 : i32, GridSize.x = 74 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown1", memory_effects = [1 : i32, 2 : i32]} : memref<64x3x7x7xf32, "cuda">, memref<64x3x7x7xf16, "cuda"> - %2 = "byre.alias"(%alloc) {device = "cuda", offset = 50996288 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x112x112xf16, "cuda"> + %alloc = memref.alloc() : memref<76533504xi8, "cuda"> + %0 = "byre.alias"(%alloc) <{offset = 75329280 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x3x224x224xf16, "cuda"> + byre.compute @PTXOp(%arg0, %0) {BlockSize.x = 256 : i32, GridSize.x = 588 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown0", memory_effects = [1 : i32, 2 : i32]} : memref<4x3x224x224xf32, "cuda">, memref<4x3x224x224xf16, "cuda"> + %1 = "byre.alias"(%alloc) <{offset = 62959360 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<64x3x7x7xf16, "cuda"> + byre.compute @PTXOp(%arg2, %1) {BlockSize.x = 256 : i32, GridSize.x = 10 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown1", memory_effects = [1 : i32, 2 : i32]} : memref<64x3x7x7xf32, "cuda">, memref<64x3x7x7xf16, "cuda"> + %2 = "byre.alias"(%alloc) <{offset = 49311488 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x64x112x112xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%0, %1, %2) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x3x224x224xf16, "cuda">, memref<64x3x7x7xf16, "cuda">, memref<4x64x112x112xf16, "cuda"> - %3 = "byre.alias"(%alloc) {device = "cuda", offset = 44573760 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x112x112xf16, "cuda"> + %3 = 
"byre.alias"(%alloc) <{offset = 42888960 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x64x112x112xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%2, %arg3, %arg4, %3) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x112x112xf16, "cuda"> - %4 = "byre.alias"(%alloc) {device = "cuda", offset = 5080128 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg7, %4) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown3", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> - %5 = "byre.alias"(%alloc) {device = "cuda", offset = 5006400 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg12, %5) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown4", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> - %6 = "byre.alias"(%alloc) {device = "cuda", offset = 1552384 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg17, %6) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown5", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> - %7 = "byre.alias"(%alloc) {device = "cuda", offset = 5153856 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg22, %7) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown6", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> - %8 = "byre.alias"(%alloc) {device = "cuda", offset = 4247104 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x64x1x1xf16, "cuda"> - byre.compute @PTXOp(%arg37, %8) {BlockSize.x = 128 : i32, GridSize.x = 64 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown7", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x1x1xf32, "cuda">, memref<128x64x1x1xf16, "cuda"> - %9 = "byre.alias"(%alloc) {device = "cuda", offset = 602112 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg27, %9) {BlockSize.x = 128 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown8", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x3x3xf32, "cuda">, memref<128x64x3x3xf16, "cuda"> - %10 = "byre.alias"(%alloc) {device = "cuda", offset = 2383872 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg32, %10) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown9", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> - %11 = "byre.alias"(%alloc) {device = "cuda", offset = 2088960 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg42, %11) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown10", memory_effects = [1 : i32, 2 : 
i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> - %12 = "byre.alias"(%alloc) {device = "cuda", offset = 2678784 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg47, %12) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown11", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> - %13 = "byre.alias"(%alloc) {device = "cuda", offset = 4940864 : i64} : (memref<76022848xi8, "cuda">) -> memref<256x128x1x1xf16, "cuda"> - byre.compute @PTXOp(%arg62, %13) {BlockSize.x = 128 : i32, GridSize.x = 256 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown12", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x1x1xf32, "cuda">, memref<256x128x1x1xf16, "cuda"> - %14 = "byre.alias"(%alloc) {device = "cuda", offset = 60228672 : i64} : (memref<76022848xi8, "cuda">) -> memref<256x128x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg52, %14) {BlockSize.x = 128 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown13", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x3x3xf32, "cuda">, memref<256x128x3x3xf16, "cuda"> - %15 = "byre.alias"(%alloc) {device = "cuda", offset = 73663552 : i64} : (memref<76022848xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg57, %15) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown14", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> - %16 = "byre.alias"(%alloc) {device = "cuda", offset = 18850880 : i64} : (memref<76022848xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg67, %16) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown15", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> - %17 = "byre.alias"(%alloc) {device = "cuda", offset = 6833216 : i64} : (memref<76022848xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg72, %17) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown16", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> - %18 = "byre.alias"(%alloc) {device = "cuda", offset = 59425856 : i64} : (memref<76022848xi8, "cuda">) -> memref<512x256x1x1xf16, "cuda"> - byre.compute @PTXOp(%arg87, %18) {BlockSize.x = 128 : i32, GridSize.x = 1024 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown17", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x1x1xf32, "cuda">, memref<512x256x1x1xf16, "cuda"> - %19 = "byre.alias"(%alloc) {device = "cuda", offset = 21636160 : i64} : (memref<76022848xi8, "cuda">) -> memref<512x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg77, %19) {BlockSize.x = 128 : i32, GridSize.x = 9216 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown18", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x3x3xf32, "cuda">, memref<512x256x3x3xf16, "cuda"> - %20 = "byre.alias"(%alloc) {device = "cuda", offset = 38151232 : i64} : (memref<76022848xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg82, %20) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : 
i32, 4 : i32], device = "cuda", kernel_name = "Unknown19", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> - %21 = "byre.alias"(%alloc) {device = "cuda", offset = 33432640 : i64} : (memref<76022848xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg92, %21) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown20", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> - %22 = "byre.alias"(%alloc) {device = "cuda", offset = 28714048 : i64} : (memref<76022848xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> - byre.compute @PTXOp(%arg97, %22) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown21", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> - %23 = "byre.alias"(%alloc) {device = "cuda", offset = 749568 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x1000xf16, "cuda"> - byre.compute @PTXOp(%arg1, %23) {BlockSize.x = 128 : i32, GridSize.x = 32 : i32, arg_ranks = [2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown22", memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf32, "cuda">, memref<4x1000xf16, "cuda"> - %24 = "byre.alias"(%alloc) {device = "cuda", offset = 23995456 : i64} : (memref<76022848xi8, "cuda">) -> memref<1000x512xf16, "cuda"> - byre.compute @PTXOp(%arg102, %24) {BlockSize.x = 128 : i32, GridSize.x = 4000 : i32, arg_ranks = [2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown23", memory_effects = [1 : i32, 2 : i32]} : memref<1000x512xf32, "cuda">, memref<1000x512xf16, "cuda"> - %25 = "byre.alias"(%alloc) {device = "cuda", offset = 757568 : i64} : (memref<76022848xi8, "cuda">) -> memref<4xf16, "cuda"> - byre.compute @ReduceSumOp_f16_f16(%23, %25) {device = "cuda", dimensions = dense<1> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf16, "cuda">, memref<4xf16, "cuda"> - %26 = "byre.alias"(%alloc) {device = "cuda", offset = 62424128 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x112x112xf16, "cuda"> - %27 = "byre.alias"(%alloc) {device = "cuda", offset = 59827264 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x112x112xi1, "cuda"> - byre.compute @PTXOp(%3, %26, %27) {BlockSize.x = 128 : i32, GridSize.x = 25088 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown24", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xi1, "cuda"> - %28 = "byre.alias"(%alloc) {device = "cuda", offset = 5227584 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @PoolMaxOp_f16_f16(%26, %28) {base_dilations = dense<1> : tensor<4xi64>, device = "cuda", memory_effects = [1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = dense<1> : tensor<4xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %29 = "byre.alias"(%alloc) {device = "cuda", offset = 20030528 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%28, %4, %29) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = 
"NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %30 = "byre.alias"(%alloc) {device = "cuda", offset = 44573760 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%29, %arg8, %arg9, %30) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> - %31 = "byre.alias"(%alloc) {device = "cuda", offset = 17245248 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - %32 = "byre.alias"(%alloc) {device = "cuda", offset = 301056 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xi1, "cuda"> - byre.compute @PTXOp(%30, %31, %32) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown26", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda"> - %33 = "byre.alias"(%alloc) {device = "cuda", offset = 15639616 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%31, %5, %33) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %34 = "byre.alias"(%alloc) {device = "cuda", offset = 42869824 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%33, %arg13, %arg14, %34) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> - %35 = "byre.alias"(%alloc) {device = "cuda", offset = 501760 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xi1, "cuda"> - byre.compute @PTXOp(%34, %28, %30, %35) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown28", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda"> - %36 = "byre.alias"(%alloc) {device = "cuda", offset = 14033984 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%30, %6, %36) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, 
"cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %37 = "byre.alias"(%alloc) {device = "cuda", offset = 46179392 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%36, %arg18, %arg19, %37) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> - %38 = "byre.alias"(%alloc) {device = "cuda", offset = 12428352 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - %39 = "byre.alias"(%alloc) {device = "cuda", offset = 200704 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xi1, "cuda"> - byre.compute @PTXOp(%37, %38, %39) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown30", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda"> - %40 = "byre.alias"(%alloc) {device = "cuda", offset = 9217088 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + %4 = "byre.alias"(%alloc) <{offset = 5545728 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg7, %4) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown3", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> + %5 = "byre.alias"(%alloc) <{offset = 5361664 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg12, %5) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown3", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> + %6 = "byre.alias"(%alloc) <{offset = 6283008 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg17, %6) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown3", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> + %7 = "byre.alias"(%alloc) <{offset = 6209280 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg22, %7) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown3", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf32, "cuda">, memref<64x64x3x3xf16, "cuda"> + %8 = "byre.alias"(%alloc) <{offset = 5463808 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<128x64x1x1xf16, "cuda"> + byre.compute @PTXOp(%arg37, %8) {BlockSize.x = 256 : i32, GridSize.x = 8 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown7", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x1x1xf32, "cuda">, memref<128x64x1x1xf16, "cuda"> + %9 = "byre.alias"(%alloc) <{offset = 6557440 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<128x64x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg27, %9) {BlockSize.x = 256 : i32, GridSize.x = 72 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown8", memory_effects = [1 : 
i32, 2 : i32]} : memref<128x64x3x3xf32, "cuda">, memref<128x64x3x3xf16, "cuda"> + %10 = "byre.alias"(%alloc) <{offset = 2256896 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg32, %10) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown9", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> + %11 = "byre.alias"(%alloc) <{offset = 1761280 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg42, %11) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown9", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> + %12 = "byre.alias"(%alloc) <{offset = 0 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg47, %12) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown9", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf32, "cuda">, memref<128x128x3x3xf16, "cuda"> + %13 = "byre.alias"(%alloc) <{offset = 5480192 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<256x128x1x1xf16, "cuda"> + byre.compute @PTXOp(%arg62, %13) {BlockSize.x = 256 : i32, GridSize.x = 32 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown12", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x1x1xf32, "cuda">, memref<256x128x1x1xf16, "cuda"> + %14 = "byre.alias"(%alloc) <{offset = 5619456 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<256x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg52, %14) {BlockSize.x = 256 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown13", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x3x3xf32, "cuda">, memref<256x128x3x3xf16, "cuda"> + %15 = "byre.alias"(%alloc) <{offset = 74149632 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg57, %15) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown14", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> + %16 = "byre.alias"(%alloc) <{offset = 21556992 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg67, %16) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown14", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> + %17 = "byre.alias"(%alloc) <{offset = 8711936 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg72, %17) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown14", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf32, "cuda">, memref<256x256x3x3xf16, "cuda"> + %18 = "byre.alias"(%alloc) <{offset = 4558848 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<512x256x1x1xf16, "cuda"> + byre.compute @PTXOp(%arg87, %18) {BlockSize.x = 256 : i32, GridSize.x = 128 : i32, 
arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown17", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x1x1xf32, "cuda">, memref<512x256x1x1xf16, "cuda"> + %19 = "byre.alias"(%alloc) <{offset = 23162624 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<512x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg77, %19) {BlockSize.x = 256 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown18", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x3x3xf32, "cuda">, memref<512x256x3x3xf16, "cuda"> + %20 = "byre.alias"(%alloc) <{offset = 28733184 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg82, %20) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown19", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> + %21 = "byre.alias"(%alloc) <{offset = 33451776 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg92, %21) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown19", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> + %22 = "byre.alias"(%alloc) <{offset = 38170368 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%arg97, %22) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown19", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf32, "cuda">, memref<512x512x3x3xf16, "cuda"> + %23 = "byre.alias"(%alloc) <{offset = 5439616 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x1000xf16, "cuda"> + byre.compute @PTXOp(%arg1, %23) {BlockSize.x = 256 : i32, GridSize.x = 4 : i32, arg_ranks = [2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown22", memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf32, "cuda">, memref<4x1000xf16, "cuda"> + %24 = "byre.alias"(%alloc) <{offset = 72969984 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<1000x512xf16, "cuda"> + byre.compute @PTXOp(%arg102, %24) {BlockSize.x = 256 : i32, GridSize.x = 500 : i32, arg_ranks = [2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown23", memory_effects = [1 : i32, 2 : i32]} : memref<1000x512xf32, "cuda">, memref<1000x512xf16, "cuda"> + %25 = "byre.alias"(%alloc) <{offset = 73993984 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<1000xf16, "cuda"> + byre.compute @PTXOp(%arg103, %25) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown24", memory_effects = [1 : i32, 2 : i32]} : memref<1000xf32, "cuda">, memref<1000xf16, "cuda"> + %26 = "byre.alias"(%alloc) <{offset = 5435392 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4xf16, "cuda"> + byre.compute @PTXOp(%23, %26) {BlockSize.x = 512 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 4 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown25_kernel"} : memref<4x1000xf16, "cuda">, memref<4xf16, "cuda"> + %27 = "byre.alias"(%alloc) <{offset = 62959360 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x64x112x112xf16, "cuda"> + %28 = 
"byre.alias"(%alloc) <{offset = 25521920 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x64x112x112xi1, "cuda"> + byre.compute @PTXOp(%3, %27, %28) {BlockSize.x = 256 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown26", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xi1, "cuda"> + %29 = "byre.alias"(%alloc) <{offset = 15134464 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @PoolMaxOp_f16_f16(%27, %29) {base_dilations = dense<1> : tensor<4xi64>, device = "cuda", memory_effects = [1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = dense<1> : tensor<4xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + %30 = "byre.alias"(%alloc) <{offset = 16740096 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%29, %4, %30) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + %31 = "byre.alias"(%alloc) <{offset = 42888960 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%30, %arg8, %arg9, %31) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> + %32 = "byre.alias"(%alloc) <{offset = 19951360 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + %33 = "byre.alias"(%alloc) <{offset = 69381888 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xi1, "cuda"> + byre.compute @PTXOp(%31, %32, %33) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown28", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda"> + %34 = "byre.alias"(%alloc) <{offset = 7106304 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%32, %5, %34) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%34, %arg13, %arg14, %31) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 
: i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> + %35 = "byre.alias"(%alloc) <{offset = 18345728 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + %36 = "byre.alias"(%alloc) <{offset = 70987520 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xi1, "cuda"> + byre.compute @PTXOp(%31, %29, %35, %36) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown30", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda"> + byre.compute @ConvOp_f16f16_f16(%35, %6, %31) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + %37 = "byre.alias"(%alloc) <{offset = 44494592 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%31, %arg18, %arg19, %37) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> + %38 = "byre.alias"(%alloc) <{offset = 13528832 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + %39 = "byre.alias"(%alloc) <{offset = 57339648 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xi1, "cuda"> + byre.compute @PTXOp(%37, %38, %39) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown28", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda"> + %40 = "byre.alias"(%alloc) <{offset = 11923200 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%38, %7, %40) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%40, %arg23, %arg24, %37) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> - %41 = "byre.alias"(%alloc) {device = "cuda", offset = 10822720 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - %42 = "byre.alias"(%alloc) {device = "cuda", offset = 401408 : i64} : (memref<76022848xi8, "cuda">) -> 
memref<4x64x56x56xi1, "cuda"> - byre.compute @PTXOp(%37, %30, %41, %42) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown32", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda"> - %43 = "byre.alias"(%alloc) {device = "cuda", offset = 61621312 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%41, %8, %43) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<128x64x1x1xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - %44 = "byre.alias"(%alloc) {device = "cuda", offset = 42869824 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %41 = "byre.alias"(%alloc) <{offset = 10317568 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%40, %arg23, %arg24, %41) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda"> + %42 = "byre.alias"(%alloc) <{offset = 70184704 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xi1, "cuda"> + byre.compute @PTXOp(%41, %35, %37, %42) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown30", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda"> + %43 = "byre.alias"(%alloc) <{offset = 58142464 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%37, %8, %43) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<128x64x1x1xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + %44 = "byre.alias"(%alloc) <{offset = 10317568 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%43, %arg38, %arg39, %44) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda"> - %45 = "byre.alias"(%alloc) {device = "cuda", offset = 70452288 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%41, %9, %45) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", 
kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<128x64x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - %46 = "byre.alias"(%alloc) {device = "cuda", offset = 46179392 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %45 = "byre.alias"(%alloc) <{offset = 59748096 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%37, %9, %45) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<128x64x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + %46 = "byre.alias"(%alloc) <{offset = 46100224 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%45, %arg28, %arg29, %46) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda"> - %47 = "byre.alias"(%alloc) {device = "cuda", offset = 71255104 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> - %48 = "byre.alias"(%alloc) {device = "cuda", offset = 4740160 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xi1, "cuda"> - byre.compute @PTXOp(%46, %47, %48) {BlockSize.x = 128 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown35", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda"> - %49 = "byre.alias"(%alloc) {device = "cuda", offset = 72057920 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %47 = "byre.alias"(%alloc) <{offset = 60550912 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %48 = "byre.alias"(%alloc) <{offset = 3756032 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xi1, "cuda"> + byre.compute @PTXOp(%46, %47, %48) {BlockSize.x = 256 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown37", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda"> + %49 = "byre.alias"(%alloc) <{offset = 62156544 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%47, %10, %49) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, 
memref<4x128x28x28xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%49, %arg33, %arg34, %46) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda"> - %50 = "byre.alias"(%alloc) {device = "cuda", offset = 69649472 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> - %51 = "byre.alias"(%alloc) {device = "cuda", offset = 4790336 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xi1, "cuda"> - byre.compute @PTXOp(%46, %44, %50, %51) {BlockSize.x = 128 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown37", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda"> - %52 = "byre.alias"(%alloc) {device = "cuda", offset = 60818496 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %50 = "byre.alias"(%alloc) <{offset = 55734016 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %51 = "byre.alias"(%alloc) <{offset = 3354624 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xi1, "cuda"> + byre.compute @PTXOp(%46, %44, %50, %51) {BlockSize.x = 256 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown39", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda"> + %52 = "byre.alias"(%alloc) <{offset = 61353728 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%50, %11, %52) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%52, %arg43, %arg44, %46) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda"> - %53 = "byre.alias"(%alloc) {device = "cuda", offset = 57418816 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> - %54 = "byre.alias"(%alloc) {device = "cuda", offset = 4840512 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xi1, "cuda"> - byre.compute @PTXOp(%46, %53, %54) {BlockSize.x = 128 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown39", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda"> - %55 = "byre.alias"(%alloc) {device = "cuda", offset = 68846656 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %53 = "byre.alias"(%alloc) <{offset = 58945280 : i64}> 
{device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %54 = "byre.alias"(%alloc) <{offset = 2953216 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xi1, "cuda"> + byre.compute @PTXOp(%46, %53, %54) {BlockSize.x = 256 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown37", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda"> + %55 = "byre.alias"(%alloc) <{offset = 56536832 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> byre.compute @ConvOp_f16f16_f16(%53, %12, %55) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> byre.compute @BatchNormTrainingOp_f16f32f32_f16(%55, %arg48, %arg49, %46) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda"> - %56 = "byre.alias"(%alloc) {device = "cuda", offset = 72860736 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> - %57 = "byre.alias"(%alloc) {device = "cuda", offset = 4890688 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xi1, "cuda"> - byre.compute @PTXOp(%46, %50, %56, %57) {BlockSize.x = 128 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown41", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda"> - %58 = "byre.alias"(%alloc) {device = "cuda", offset = 2973696 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%56, %13, %58) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<256x128x1x1xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - %59 = "byre.alias"(%alloc) {device = "cuda", offset = 46179392 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%58, %arg63, %arg64, %59) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> - %60 = "byre.alias"(%alloc) {device = "cuda", offset = 59024448 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%56, %14, %60) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", 
kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - %61 = "byre.alias"(%alloc) {device = "cuda", offset = 46580800 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%60, %arg53, %arg54, %61) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> - %62 = "byre.alias"(%alloc) {device = "cuda", offset = 58623040 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - %63 = "byre.alias"(%alloc) {device = "cuda", offset = 4263488 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xi1, "cuda"> - byre.compute @PTXOp(%61, %62, %63) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown44", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda"> - %64 = "byre.alias"(%alloc) {device = "cuda", offset = 58221632 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%62, %15, %64) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%64, %arg58, %arg59, %61) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> - %65 = "byre.alias"(%alloc) {device = "cuda", offset = 4338752 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - %66 = "byre.alias"(%alloc) {device = "cuda", offset = 4288576 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xi1, "cuda"> - byre.compute @PTXOp(%61, %59, %65, %66) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown46", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda"> - %67 = "byre.alias"(%alloc) {device = "cuda", offset = 3776512 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%65, %16, %67) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = 
dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%67, %arg68, %arg69, %59) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> - %68 = "byre.alias"(%alloc) {device = "cuda", offset = 3375104 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - %69 = "byre.alias"(%alloc) {device = "cuda", offset = 4313664 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xi1, "cuda"> - byre.compute @PTXOp(%59, %68, %69) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown48", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda"> - %70 = "byre.alias"(%alloc) {device = "cuda", offset = 74843200 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%68, %17, %70) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%70, %arg73, %arg74, %59) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> - %71 = "byre.alias"(%alloc) {device = "cuda", offset = 75244608 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - %72 = "byre.alias"(%alloc) {device = "cuda", offset = 4177920 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xi1, "cuda"> - byre.compute @PTXOp(%59, %65, %71, %72) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown50", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda"> - %73 = "byre.alias"(%alloc) {device = "cuda", offset = 950272 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%71, %18, %73) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - %74 = "byre.alias"(%alloc) {device = "cuda", offset = 42869824 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%73, %arg88, %arg89, %74) {device = "cuda", epsilon = 9.99999974E-6 
: f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> - %75 = "byre.alias"(%alloc) {device = "cuda", offset = 1150976 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%71, %19, %75) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<512x256x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - %76 = "byre.alias"(%alloc) {device = "cuda", offset = 46179392 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%75, %arg78, %arg79, %76) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> - %77 = "byre.alias"(%alloc) {device = "cuda", offset = 1351680 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - %78 = "byre.alias"(%alloc) {device = "cuda", offset = 59688000 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xi1, "cuda"> - byre.compute @PTXOp(%76, %77, %78) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown53", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda"> - %79 = "byre.alias"(%alloc) {device = "cuda", offset = 0 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%77, %20, %79) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%79, %arg83, %arg84, %76) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> - %80 = "byre.alias"(%alloc) {device = "cuda", offset = 1626112 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - %81 = "byre.alias"(%alloc) {device = "cuda", offset = 59700544 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xi1, "cuda"> - byre.compute @PTXOp(%76, %74, %80, %81) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown55", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda"> - byre.compute @ConvOp_f16f16_f16(%80, %21, %76) {batch_group_count = 1 : i64, device = 
"cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - %82 = "byre.alias"(%alloc) {device = "cuda", offset = 46380096 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%76, %arg93, %arg94, %82) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> - %83 = "byre.alias"(%alloc) {device = "cuda", offset = 1826816 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - %84 = "byre.alias"(%alloc) {device = "cuda", offset = 59713088 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xi1, "cuda"> - byre.compute @PTXOp(%82, %83, %84) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown57", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda"> - %85 = "byre.alias"(%alloc) {device = "cuda", offset = 75646016 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @ConvOp_f16f16_f16(%83, %22, %85) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormTrainingOp_f16f32f32_f16(%85, %arg98, %arg99, %74) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> - %86 = "byre.alias"(%alloc) {device = "cuda", offset = 75846720 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xi1, "cuda"> - byre.compute @PTXOp(%74, %80, %82, %86) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown59", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda"> - %87 = "byre.alias"(%alloc) {device = "cuda", offset = 42869824 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512xf16, "cuda"> - byre.compute @ReduceSumOp_f16_f16(%82, %87) {device = "cuda", dimensions = dense<[3, 2]> : tensor<2xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512xf16, "cuda"> - %88 = "byre.alias"(%alloc) {device = "cuda", offset = 4203008 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512xf16, "cuda"> - byre.compute @PTXOp(%87, %88) {BlockSize.x = 128 : i32, GridSize.x = 16 : i32, arg_ranks = [2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown60", memory_effects = 
[1 : i32, 2 : i32]} : memref<4x512xf16, "cuda">, memref<4x512xf16, "cuda"> - %89 = "byre.alias"(%alloc) {device = "cuda", offset = 46380096 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x1000xf16, "cuda"> - byre.compute @MatmulOp_f16f16_f16(%88, %24, %89) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<4x512xf16, "cuda">, memref<1000x512xf16, "cuda">, memref<4x1000xf16, "cuda"> - %90 = "byre.alias"(%alloc) {device = "cuda", offset = 25019456 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x1000xf16, "cuda"> - byre.compute @PTXOp(%arg103, %89, %90) {BlockSize.x = 128 : i32, GridSize.x = 32 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown61", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1000xf32, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf16, "cuda"> - %91 = "byre.alias"(%alloc) {device = "cuda", offset = 25027456 : i64} : (memref<76022848xi8, "cuda">) -> memref<4xf16, "cuda"> - byre.compute @ReduceMaxOp_f16_f16(%90, %91) {device = "cuda", dimensions = dense<1> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf16, "cuda">, memref<4xf16, "cuda"> - %92 = "byre.alias"(%alloc) {device = "cuda", offset = 42869824 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x1000xf16, "cuda"> - byre.compute @PTXOp(%91, %90, %92, %89) {BlockSize.x = 128 : i32, GridSize.x = 32 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown62", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf16, "cuda"> - %93 = "byre.alias"(%alloc) {device = "cuda", offset = 42877824 : i64} : (memref<76022848xi8, "cuda">) -> memref<4xf16, "cuda"> - byre.compute @ReduceSumOp_f16_f16(%89, %93) {device = "cuda", dimensions = dense<1> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf16, "cuda">, memref<4xf16, "cuda"> - %94 = "byre.alias"(%alloc) {device = "cuda", offset = 4207104 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x1000xf16, "cuda"> - %95 = "byre.alias"(%alloc) {device = "cuda", offset = 4215104 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x1000xf32, "cuda"> - %96 = "byre.alias"(%alloc) {device = "cuda", offset = 4231104 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x1000xf32, "cuda"> - byre.compute @PTXOp(%93, %92, %25, %23, %arg1, %94, %95, %96) {BlockSize.x = 128 : i32, GridSize.x = 32 : i32, arg_ranks = [1 : i32, 2 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32, 2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown63", memory_effects = [1 : i32, 1 : i32, 1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf32, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf32, "cuda">, memref<4x1000xf32, "cuda"> - %97 = "byre.alias"(%alloc) {device = "cuda", offset = 46380096 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512xf16, "cuda"> - byre.compute @MatmulOp_f16f16_f16(%94, %24, %97) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<4x1000xf16, "cuda">, memref<1000x512xf16, "cuda">, memref<4x512xf16, "cuda"> - %98 = "byre.alias"(%alloc) {device = "cuda", offset = 749568 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute 
@PTXOp(%97, %86, %98) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [2 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown64", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x512xf16, "cuda">, memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%85, %arg98, %98, %82, %arg163, %arg164) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%82, %22, %74) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%83, %82, %22) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> - byre.compute @PTXOp(%84, %74, %82) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown68", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%76, %arg93, %82, %74, %arg160, %arg161) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%74, %21, %76) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - %99 = "byre.alias"(%alloc) {device = "cuda", offset = 23995456 : i64} : (memref<76022848xi8, "cuda">) -> memref<512x512x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%80, %74, %99) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> - byre.compute @PTXOp(%98, %76, %81, %83) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown72", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, 
"cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%79, %arg83, %83, %76, %arg154, %arg155) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%76, %20, %85) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%77, %76, %21) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> - byre.compute @PTXOp(%78, %85, %76) {BlockSize.x = 128 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown76", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> - %100 = "byre.alias"(%alloc) {device = "cuda", offset = 38151232 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%75, %arg78, %76, %100, %arg151, %arg152) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%100, %19, %59) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%71, %100, %19) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x256x3x3xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%73, %arg88, %83, %100, %arg157, %arg158) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> - byre.compute 
@ConvBackwardDataOp_f16f16_f16(%100, %18, %61) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - %101 = "byre.alias"(%alloc) {device = "cuda", offset = 1826816 : i64} : (memref<76022848xi8, "cuda">) -> memref<512x256x1x1xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%71, %100, %101) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda"> - %102 = "byre.alias"(%alloc) {device = "cuda", offset = 59425856 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @PTXOp(%61, %59, %72, %102) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown83", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%70, %arg73, %102, %59, %arg148, %arg149) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - %103 = "byre.alias"(%alloc) {device = "cuda", offset = 38151232 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%59, %17, %103) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - %104 = "byre.alias"(%alloc) {device = "cuda", offset = 74843200 : i64} : (memref<76022848xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%68, %59, %104) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%69, %103, %59) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown87", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%67, %arg68, %59, %103, %arg145, %arg146) {device = "cuda", epsilon = 9.99999974E-6 : f32, 
feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%103, %16, %59) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%65, %103, %16) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%102, %59, %66, %65) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown91", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%64, %arg58, %65, %59, %arg139, %arg140) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%59, %15, %103) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%62, %59, %17) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> - byre.compute @PTXOp(%63, %103, %59) {BlockSize.x = 128 : i32, GridSize.x = 1568 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown95", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%60, %arg53, %59, %103, %arg136, %arg137) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - 
byre.compute @ConvBackwardDataOp_f16f16_f16(%103, %14, %46) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%56, %103, %14) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x128x3x3xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%58, %arg63, %65, %103, %arg142, %arg143) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> - %105 = "byre.alias"(%alloc) {device = "cuda", offset = 46982208 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%103, %13, %105) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x128x1x1xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - %106 = "byre.alias"(%alloc) {device = "cuda", offset = 59425856 : i64} : (memref<76022848xi8, "cuda">) -> memref<256x128x1x1xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%56, %103, %106) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x128x1x1xf16, "cuda"> - byre.compute @PTXOp(%105, %46, %57, %56) {BlockSize.x = 128 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown102", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%55, %arg48, %56, %46, %arg133, %arg134) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - %107 = "byre.alias"(%alloc) {device = "cuda", offset = 38151232 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + %56 = "byre.alias"(%alloc) <{offset = 4960256 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xi1, "cuda"> + byre.compute @PTXOp(%46, %50, %44, 
%56) {BlockSize.x = 256 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown39", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda"> + %57 = "byre.alias"(%alloc) <{offset = 11120384 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%44, %13, %57) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<256x128x1x1xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + %58 = "byre.alias"(%alloc) <{offset = 11521792 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%57, %arg63, %arg64, %58) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> + %59 = "byre.alias"(%alloc) <{offset = 46100224 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%44, %14, %59) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + %60 = "byre.alias"(%alloc) <{offset = 46501632 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%59, %arg53, %arg54, %60) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> + %61 = "byre.alias"(%alloc) <{offset = 2551808 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + %62 = "byre.alias"(%alloc) <{offset = 6704896 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xi1, "cuda"> + byre.compute @PTXOp(%60, %61, %62) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown46", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda"> + byre.compute @ConvOp_f16f16_f16(%61, %15, %60) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : 
tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + %63 = "byre.alias"(%alloc) <{offset = 46903040 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%60, %arg58, %arg59, %63) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> + %64 = "byre.alias"(%alloc) <{offset = 4157440 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + %65 = "byre.alias"(%alloc) <{offset = 6905600 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xi1, "cuda"> + byre.compute @PTXOp(%63, %58, %64, %65) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown48", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda"> + %66 = "byre.alias"(%alloc) <{offset = 22736640 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%64, %16, %66) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%66, %arg68, %arg69, %63) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> + %67 = "byre.alias"(%alloc) <{offset = 71790336 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + %68 = "byre.alias"(%alloc) <{offset = 2056192 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xi1, "cuda"> + byre.compute @PTXOp(%63, %67, %68) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown46", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda"> + %69 = "byre.alias"(%alloc) <{offset = 9891584 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%67, %17, %69) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%69, %arg73, %arg74, %63) {device = "cuda", epsilon = 
9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda"> + %70 = "byre.alias"(%alloc) <{offset = 72191744 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + %71 = "byre.alias"(%alloc) <{offset = 294912 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xi1, "cuda"> + byre.compute @PTXOp(%63, %64, %70, %71) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown48", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda"> + %72 = "byre.alias"(%alloc) <{offset = 495616 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%70, %18, %72) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + %73 = "byre.alias"(%alloc) <{offset = 11521792 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%72, %arg88, %arg89, %73) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> + %74 = "byre.alias"(%alloc) <{offset = 696320 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%70, %19, %74) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<512x256x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + %75 = "byre.alias"(%alloc) <{offset = 46903040 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%74, %arg78, %arg79, %75) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> + %76 = "byre.alias"(%alloc) <{offset = 897024 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + %77 = "byre.alias"(%alloc) <{offset = 6457088 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xi1, "cuda"> + byre.compute @PTXOp(%75, %76, %77) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown55", memory_effects = [1 : i32, 2 : 
i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda"> + byre.compute @ConvOp_f16f16_f16(%76, %20, %75) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + %78 = "byre.alias"(%alloc) <{offset = 47103744 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%75, %arg83, %arg84, %78) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> + %79 = "byre.alias"(%alloc) <{offset = 1097728 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + %80 = "byre.alias"(%alloc) <{offset = 4820992 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xi1, "cuda"> + byre.compute @PTXOp(%78, %73, %79, %80) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown57", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda"> + %81 = "byre.alias"(%alloc) <{offset = 1298432 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%79, %21, %81) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%81, %arg93, %arg94, %78) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> + %82 = "byre.alias"(%alloc) <{offset = 1499136 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + %83 = "byre.alias"(%alloc) <{offset = 6356736 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xi1, "cuda"> + byre.compute @PTXOp(%78, %82, %83) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown55", memory_effects = [1 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda"> + %84 = "byre.alias"(%alloc) <{offset = 72593152 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvOp_f16f16_f16(%82, %22, %84) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", 
kernel_layout = "NCHW", lhs_dilation = dense<1> : tensor<2xi64>, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<2x2xi64>, rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormTrainingOp_f16f32f32_f16(%84, %arg98, %arg99, %73) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda"> + %85 = "byre.alias"(%alloc) <{offset = 72793856 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xi1, "cuda"> + byre.compute @PTXOp(%73, %79, %78, %85) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown57", memory_effects = [1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda"> + %86 = "byre.alias"(%alloc) <{offset = 47103744 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<2048x49xf16, "cuda"> + %87 = "byre.alias"(%alloc) <{offset = 11521792 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<2048xf16, "cuda"> + byre.compute @PTXOp(%86, %87) {BlockSize.x = 64 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 2048 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown62_kernel"} : memref<2048x49xf16, "cuda">, memref<2048xf16, "cuda"> + %88 = "byre.alias"(%alloc) <{offset = 11521792 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x512xf16, "cuda"> + %89 = "byre.alias"(%alloc) <{offset = 5435520 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x512xf16, "cuda"> + byre.compute @PTXOp(%88, %89) {BlockSize.x = 256 : i32, GridSize.x = 2 : i32, arg_ranks = [2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown63", memory_effects = [1 : i32, 2 : i32]} : memref<4x512xf16, "cuda">, memref<4x512xf16, "cuda"> + %90 = "byre.alias"(%alloc) <{offset = 47103744 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x1000xf16, "cuda"> + byre.compute @MatmulOp_f16f16_f16(%89, %24, %90) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 1 : i64} : memref<4x512xf16, "cuda">, memref<1000x512xf16, "cuda">, memref<4x1000xf16, "cuda"> + %91 = "byre.alias"(%alloc) <{offset = 11521792 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x1000xf16, "cuda"> + byre.compute @PTXOp(%25, %90, %91) {BlockSize.x = 256 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown64", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<1000xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf16, "cuda"> + %92 = "byre.alias"(%alloc) <{offset = 11529856 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4xf16, "cuda"> + byre.compute @PTXOp(%91, %92) {BlockSize.x = 512 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 4 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown65_kernel"} : memref<4x1000xf16, "cuda">, memref<4xf16, "cuda"> + byre.compute @PTXOp(%92, %91, %90) {BlockSize.x = 256 
: i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown66", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf16, "cuda"> + %93 = "byre.alias"(%alloc) <{offset = 47111808 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4xf16, "cuda"> + byre.compute @PTXOp(%90, %93) {BlockSize.x = 512 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 4 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown67_kernel"} : memref<4x1000xf16, "cuda">, memref<4xf16, "cuda"> + %94 = "byre.alias"(%alloc) <{offset = 11521792 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4xf16, "cuda"> + byre.compute @PTXOp(%93, %94) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown68", memory_effects = [1 : i32, 2 : i32]} : memref<4xf16, "cuda">, memref<4xf16, "cuda"> + %95 = "byre.alias"(%alloc) <{offset = 5447680 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x1000xf16, "cuda"> + %96 = "byre.alias"(%alloc) <{offset = 5455744 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x1000xf16, "cuda"> + byre.compute @PTXOp(%94, %90, %26, %23, %95, %96) {BlockSize.x = 256 : i32, GridSize.x = 4 : i32, arg_ranks = [1 : i32, 2 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown69", memory_effects = [1 : i32, 1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32]} : memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<4x1000xf16, "cuda"> + %97 = "byre.alias"(%alloc) <{offset = 47103744 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x512xf16, "cuda"> + byre.compute @MatmulOp_f16f16_f16(%96, %24, %97) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<4x1000xf16, "cuda">, memref<1000x512xf16, "cuda">, memref<4x512xf16, "cuda"> + %98 = "byre.alias"(%alloc) <{offset = 72969984 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x512x7x7xf16, "cuda"> + byre.compute @PTXOp(%97, %85, %98) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [2 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown70", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x512xf16, "cuda">, memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%84, %arg98, %98, %78, %arg163, %arg164) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%78, %22, %73) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%82, %78, %22) {batch_group_count = 1 : i64, device = "cuda", 
feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%83, %73, %78) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown74", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%81, %arg93, %78, %73, %arg160, %arg161) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%73, %21, %78) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%79, %73, %21) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> + byre.compute @PTXOp(%98, %78, %80, %84) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown78", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%75, %arg83, %84, %73, %arg154, %arg155) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%73, %20, %75) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%76, %73, %20) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x512x3x3xf16, "cuda"> + 
byre.compute @PTXOp(%77, %75, %73) {BlockSize.x = 256 : i32, GridSize.x = 98 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown74", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x512x7x7xi1, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%74, %arg78, %73, %75, %arg151, %arg152) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%75, %19, %58) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%70, %75, %19) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x256x3x3xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%72, %arg88, %84, %98, %arg157, %arg158) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512xf32, "cuda">, memref<512xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%98, %18, %63) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + %99 = "byre.alias"(%alloc) <{offset = 1499136 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<512x256x1x1xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%70, %98, %99) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x512x7x7xf16, "cuda">, memref<512x256x1x1xf16, "cuda"> + %100 = "byre.alias"(%alloc) <{offset = 4558848 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @PTXOp(%63, %58, %71, %100) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown89", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute 
@BatchNormGradOp_f16f32f16_f16f32f32(%69, %arg73, %100, %63, %arg148, %arg149) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%63, %17, %58) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + %101 = "byre.alias"(%alloc) <{offset = 72969984 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%67, %63, %101) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%68, %58, %63) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown93", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%66, %arg68, %63, %58, %arg145, %arg146) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%58, %16, %63) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + %102 = "byre.alias"(%alloc) <{offset = 71790336 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<256x256x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%64, %58, %102) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> + %103 = "byre.alias"(%alloc) <{offset = 21556992 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @PTXOp(%100, %63, %65, %103) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown89", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : 
memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda"> + %104 = "byre.alias"(%alloc) <{offset = 8711936 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%60, %arg58, %103, %104, %arg139, %arg140) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%104, %15, %58) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%61, %104, %15) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x256x3x3xf16, "cuda"> + byre.compute @PTXOp(%62, %58, %60) {BlockSize.x = 256 : i32, GridSize.x = 196 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown93", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x256x14x14xi1, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%59, %arg53, %60, %58, %arg136, %arg137) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<256xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%58, %14, %46) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%44, %58, %14) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x128x3x3xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%57, %arg63, %103, %104, %arg142, %arg143) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x256x14x14xf16, "cuda">, memref<256xf32, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256xf32, 
"cuda">, memref<256xf32, "cuda"> + %105 = "byre.alias"(%alloc) <{offset = 46903040 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%104, %13, %105) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x256x14x14xf16, "cuda">, memref<256x128x1x1xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + %106 = "byre.alias"(%alloc) <{offset = 4558848 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<256x128x1x1xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%44, %104, %106) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x256x14x14xf16, "cuda">, memref<256x128x1x1xf16, "cuda"> + byre.compute @PTXOp(%105, %46, %56, %44) {BlockSize.x = 256 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown108", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%55, %arg48, %44, %46, %arg133, %arg134) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + %107 = "byre.alias"(%alloc) <{offset = 11120384 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> byre.compute @ConvBackwardDataOp_f16f16_f16(%46, %12, %107) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - %108 = "byre.alias"(%alloc) {device = "cuda", offset = 68846656 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + %108 = "byre.alias"(%alloc) <{offset = 56536832 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%53, %46, %108) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> - %109 = "byre.alias"(%alloc) {device = "cuda", offset = 73663552 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> - byre.compute @PTXOp(%54, %107, %109) {BlockSize.x = 128 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], 
device = "cuda", kernel_name = "Unknown106", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%52, %arg43, %109, %46, %arg130, %arg131) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%46, %11, %107) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - %110 = "byre.alias"(%alloc) {device = "cuda", offset = 69141568 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%50, %46, %110) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> - byre.compute @PTXOp(%56, %107, %51, %109) {BlockSize.x = 128 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown110", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%49, %arg33, %109, %46, %arg124, %arg125) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%46, %10, %107) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - %111 = "byre.alias"(%alloc) {device = "cuda", offset = 72860736 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + byre.compute @PTXOp(%54, %107, %46) {BlockSize.x = 256 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown112", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%52, %arg43, %46, %107, %arg130, %arg131) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : 
memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%107, %11, %46) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + %109 = "byre.alias"(%alloc) <{offset = 61353728 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%50, %107, %109) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> + %110 = "byre.alias"(%alloc) <{offset = 8711936 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + byre.compute @PTXOp(%44, %46, %51, %110) {BlockSize.x = 256 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown108", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%49, %arg33, %110, %46, %arg124, %arg125) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%46, %10, %44) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + %111 = "byre.alias"(%alloc) <{offset = 55734016 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<128x128x3x3xf16, "cuda"> byre.compute @ConvBackwardFilterOp_f16f16_f16(%47, %46, %111) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x128x3x3xf16, "cuda"> - byre.compute @PTXOp(%48, %107, %46) {BlockSize.x = 128 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown114", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%45, %arg28, %46, %107, %arg121, %arg122) 
{device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%107, %9, %37) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %112 = "byre.alias"(%alloc) {device = "cuda", offset = 73155648 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x64x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%41, %107, %112) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x64x3x3xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%43, %arg38, %109, %107, %arg127, %arg128) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> - %113 = "byre.alias"(%alloc) {device = "cuda", offset = 47785024 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%107, %8, %113) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x64x1x1xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %114 = "byre.alias"(%alloc) {device = "cuda", offset = 73663552 : i64} : (memref<76022848xi8, "cuda">) -> memref<128x64x1x1xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%41, %107, %114) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x64x1x1xf16, "cuda"> - byre.compute @PTXOp(%113, %37, %42, %41) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown121", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda"> - %115 = "byre.alias"(%alloc) {device = "cuda", offset = 38151232 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%40, %arg23, %41, %115, %arg118, %arg119) {device = "cuda", epsilon = 9.99999974E-6 : f32, 
feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%115, %7, %37) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %116 = "byre.alias"(%alloc) {device = "cuda", offset = 9217088 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%38, %115, %116) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%39, %37, %115) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown125", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%36, %arg18, %115, %37, %arg115, %arg116) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%37, %6, %115) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %117 = "byre.alias"(%alloc) {device = "cuda", offset = 12428352 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%30, %37, %117) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%41, %115, %35, %36) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown129", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%33, %arg13, %36, %30, %arg112, %arg113) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects 
= [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%30, %5, %115) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %118 = "byre.alias"(%alloc) {device = "cuda", offset = 15639616 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%31, %30, %118) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%32, %115, %30) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown133", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%29, %arg8, %30, %115, %arg109, %arg110) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - byre.compute @ConvBackwardDataOp_f16f16_f16(%115, %4, %30) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - %119 = "byre.alias"(%alloc) {device = "cuda", offset = 20030528 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%28, %115, %119) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda"> - byre.compute @PTXOp(%36, %30, %115) {BlockSize.x = 128 : i32, GridSize.x = 6272 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown137", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> - byre.compute @PoolMaxGradOp_f16f16_f16(%26, %115, %3) {device = "cuda", memory_effects = [1 : i32, 1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 
2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x112x112xf16, "cuda"> - %120 = "byre.alias"(%alloc) {device = "cuda", offset = 38151232 : i64} : (memref<76022848xi8, "cuda">) -> memref<4x64x112x112xf16, "cuda"> - byre.compute @PTXOp(%27, %3, %120) {BlockSize.x = 128 : i32, GridSize.x = 25088 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown138", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x64x112x112xi1, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xf16, "cuda"> - byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%2, %arg3, %120, %26, %arg106, %arg107) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda"> - %121 = "byre.alias"(%alloc) {device = "cuda", offset = 50996288 : i64} : (memref<76022848xi8, "cuda">) -> memref<64x3x7x7xf16, "cuda"> - byre.compute @ConvBackwardFilterOp_f16f16_f16(%0, %26, %121) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x3x224x224xf16, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<64x3x7x7xf16, "cuda"> - %122 = "byre.alias"(%alloc) {device = "cuda", offset = 62424128 : i64} : (memref<76022848xi8, "cuda">) -> memref - byre.compute @ReduceSumOp_f32_f32(%95, %122) {device = "cuda", dimensions = dense<[0, 1]> : tensor<2xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf32, "cuda">, memref - byre.compute @PTXOp(%122, %arg104) {BlockSize.x = 128 : i32, GridSize.x = 1 : i32, arg_ranks = [0 : i32, 0 : i32], device = "cuda", kernel_name = "Unknown141", memory_effects = [1 : i32, 2 : i32]} : memref, memref - byre.compute @PTXOp(%121, %arg105) {BlockSize.x = 128 : i32, GridSize.x = 74 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown142", memory_effects = [1 : i32, 2 : i32]} : memref<64x3x7x7xf16, "cuda">, memref<64x3x7x7xf32, "cuda"> - byre.compute @PTXOp(%119, %arg108) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown143", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%118, %arg111) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown144", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%117, %arg114) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown145", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%116, %arg117) {BlockSize.x = 128 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown146", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%112, %arg120) {BlockSize.x = 128 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name 
= "Unknown147", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x3x3xf16, "cuda">, memref<128x64x3x3xf32, "cuda"> - byre.compute @PTXOp(%111, %arg123) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown148", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> - byre.compute @PTXOp(%114, %arg126) {BlockSize.x = 128 : i32, GridSize.x = 64 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown149", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x1x1xf16, "cuda">, memref<128x64x1x1xf32, "cuda"> - byre.compute @PTXOp(%110, %arg129) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown150", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> - byre.compute @PTXOp(%108, %arg132) {BlockSize.x = 128 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown151", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda"> - byre.compute @PTXOp(%14, %arg135) {BlockSize.x = 128 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown152", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x3x3xf16, "cuda">, memref<256x128x3x3xf32, "cuda"> - byre.compute @PTXOp(%17, %arg138) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown153", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> - byre.compute @PTXOp(%106, %arg141) {BlockSize.x = 128 : i32, GridSize.x = 256 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown154", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x1x1xf16, "cuda">, memref<256x128x1x1xf32, "cuda"> - byre.compute @PTXOp(%16, %arg144) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown155", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> - byre.compute @PTXOp(%104, %arg147) {BlockSize.x = 128 : i32, GridSize.x = 4608 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown156", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda"> - byre.compute @PTXOp(%19, %arg150) {BlockSize.x = 128 : i32, GridSize.x = 9216 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown157", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x3x3xf16, "cuda">, memref<512x256x3x3xf32, "cuda"> - byre.compute @PTXOp(%21, %arg153) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown158", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> - byre.compute @PTXOp(%101, %arg156) {BlockSize.x = 128 : i32, GridSize.x = 1024 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown159", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x1x1xf16, "cuda">, memref<512x256x1x1xf32, "cuda"> - byre.compute @PTXOp(%99, %arg159) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown160", memory_effects = [1 : i32, 2 : i32]} : 
memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> - byre.compute @PTXOp(%22, %arg162) {BlockSize.x = 128 : i32, GridSize.x = 18432 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown161", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda"> - %123 = "byre.alias"(%alloc) {device = "cuda", offset = 62424128 : i64} : (memref<76022848xi8, "cuda">) -> memref<1000x512xf16, "cuda"> - byre.compute @MatmulOp_f16f16_f16(%88, %94, %123) {device = "cuda", lhs_contracting_dimension = 0 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_transpose, rhs_contracting_dimension = 0 : i64} : memref<4x512xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<1000x512xf16, "cuda"> - byre.compute @PTXOp(%123, %arg165) {BlockSize.x = 128 : i32, GridSize.x = 4000 : i32, arg_ranks = [2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown163", memory_effects = [1 : i32, 2 : i32]} : memref<1000x512xf16, "cuda">, memref<1000x512xf32, "cuda"> - %124 = "byre.alias"(%alloc) {device = "cuda", offset = 62424128 : i64} : (memref<76022848xi8, "cuda">) -> memref<1000xf32, "cuda"> - byre.compute @ReduceSumOp_f32_f32(%96, %124) {device = "cuda", dimensions = dense<0> : tensor<1xi64>, memory_effects = [1 : i32, 2 : i32]} : memref<4x1000xf32, "cuda">, memref<1000xf32, "cuda"> - byre.compute @PTXOp(%124, %arg166) {BlockSize.x = 128 : i32, GridSize.x = 8 : i32, arg_ranks = [1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown164", memory_effects = [1 : i32, 2 : i32]} : memref<1000xf32, "cuda">, memref<1000xf32, "cuda"> + byre.compute @PTXOp(%48, %44, %46) {BlockSize.x = 256 : i32, GridSize.x = 392 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown112", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x128x28x28xi1, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda"> + %112 = "byre.alias"(%alloc) <{offset = 9514752 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x128x28x28xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%45, %arg28, %46, %112, %arg121, %arg122) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda"> + byre.compute @ConvBackwardDataOp_f16f16_f16(%112, %9, %41) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda"> + %113 = "byre.alias"(%alloc) <{offset = 56028928 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<128x64x3x3xf16, "cuda"> + byre.compute @ConvBackwardFilterOp_f16f16_f16(%37, %112, %113) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x64x3x3xf16, "cuda"> + byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%43, 
%arg38, %110, %46, %arg127, %arg128) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128xf32, "cuda">, memref<128xf32, "cuda">
+    %114 = "byre.alias"(%alloc) <{offset = 8711936 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda">
+    byre.compute @ConvBackwardDataOp_f16f16_f16(%46, %8, %114) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x128x28x28xf16, "cuda">, memref<128x64x1x1xf16, "cuda">, memref<4x64x56x56xf16, "cuda">
+    %115 = "byre.alias"(%alloc) <{offset = 62156544 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<128x64x1x1xf16, "cuda">
+    byre.compute @ConvBackwardFilterOp_f16f16_f16(%37, %46, %115) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<0> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x128x28x28xf16, "cuda">, memref<128x64x1x1xf16, "cuda">
+    %116 = "byre.alias"(%alloc) <{offset = 21556992 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<4x64x56x56xf16, "cuda">
+    byre.compute @PTXOp(%114, %41, %42, %116) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown127", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda">
+    byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%40, %arg23, %116, %37, %arg118, %arg119) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">
+    byre.compute @ConvBackwardDataOp_f16f16_f16(%37, %7, %40) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda">
+    %117 = "byre.alias"(%alloc) <{offset = 10317568 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda">
+    byre.compute @ConvBackwardFilterOp_f16f16_f16(%38, %37, %117) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">
+    byre.compute @PTXOp(%39, %40, %37) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown131", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">
+    byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%31, %arg18, %37, %38, %arg115, %arg116) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">
+    byre.compute @ConvBackwardDataOp_f16f16_f16(%38, %6, %31) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda">
+    %118 = "byre.alias"(%alloc) <{offset = 11923200 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda">
+    byre.compute @ConvBackwardFilterOp_f16f16_f16(%35, %38, %118) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">
+    byre.compute @PTXOp(%116, %31, %36, %35) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown127", memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda">
+    byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%34, %arg13, %35, %31, %arg112, %arg113) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">
+    byre.compute @ConvBackwardDataOp_f16f16_f16(%31, %5, %34) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda">
+    %119 = "byre.alias"(%alloc) <{offset = 13528832 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda">
+    byre.compute @ConvBackwardFilterOp_f16f16_f16(%32, %31, %119) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">
+    byre.compute @PTXOp(%33, %34, %31) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown131", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xi1, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">
+    byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%30, %arg8, %31, %34, %arg109, %arg110) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">
+    byre.compute @ConvBackwardDataOp_f16f16_f16(%34, %4, %31) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">, memref<4x64x56x56xf16, "cuda">
+    %120 = "byre.alias"(%alloc) <{offset = 19951360 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<64x64x3x3xf16, "cuda">
+    byre.compute @ConvBackwardFilterOp_f16f16_f16(%29, %34, %120) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<1> : tensor<4xi64>, window_strides = dense<1> : tensor<2xi64>} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<64x64x3x3xf16, "cuda">
+    byre.compute @PTXOp(%35, %31, %34) {BlockSize.x = 256 : i32, GridSize.x = 784 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown143", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x56x56xf16, "cuda">
+    byre.compute @PoolMaxGradOp_f16f16_f16(%27, %34, %3) {device = "cuda", memory_effects = [1 : i32, 1 : i32, 2 : i32], padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 1, 3, 3]> : tensor<4xi64>, window_strides = dense<[1, 1, 2, 2]> : tensor<4xi64>} : memref<4x64x112x112xf16, "cuda">, memref<4x64x56x56xf16, "cuda">, memref<4x64x112x112xf16, "cuda">
+    byre.compute @PTXOp(%28, %3, %27) {BlockSize.x = 256 : i32, GridSize.x = 3136 : i32, arg_ranks = [4 : i32, 4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown144", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<4x64x112x112xi1, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xf16, "cuda">
+    byre.compute @BatchNormGradOp_f16f32f16_f16f32f32(%2, %arg3, %27, %3, %arg106, %arg107) {device = "cuda", epsilon = 9.99999974E-6 : f32, feature_index = 1 : i64, memory_effects = [1 : i32, 1 : i32, 1 : i32, 2 : i32, 2 : i32, 2 : i32]} : memref<4x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<64xf32, "cuda">, memref<64xf32, "cuda">
+    byre.compute @ConvBackwardFilterOp_f16f16_f16(%0, %3, %1) {batch_group_count = 1 : i64, device = "cuda", feature_group_count = 1 : i64, input_layout = "NCHW", kernel_layout = "NCHW", memory_effects = [1 : i32, 1 : i32, 2 : i32], output_layout = "NCHW", padding = dense<3> : tensor<4xi64>, window_strides = dense<2> : tensor<2xi64>} : memref<4x3x224x224xf16, "cuda">, memref<4x64x112x112xf16, "cuda">, memref<64x3x7x7xf16, "cuda">
+    %121 = "byre.alias"(%alloc) <{offset = 62978176 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref
+    %122 = "byre.alias"(%alloc) <{offset = 5447680 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<32x125xf16, "cuda">
+    %123 = "byre.alias"(%arg1) <{offset = 0 : i64}> {device = "cuda"} : (memref<4x1000xf32, "cuda">) -> memref<32x125xf32, "cuda">
+    %124 = "byre.alias"(%alloc) <{offset = 49311488 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<32xf32, "cuda">
+    byre.compute @PTXOp(%122, %123, %124) {BlockSize.x = 128 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 32 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown147_kernel"} : memref<32x125xf16, "cuda">, memref<32x125xf32, "cuda">, memref<32xf32, "cuda">
+    byre.compute @PTXOp(%124, %121) {BlockSize.x = 32 : i32, BlockSize.y = 1 : i32, BlockSize.z = 1 : i32, GridSize.x = 1 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown147_kernel_0"} : memref<32xf32, "cuda">, memref
+    byre.compute @PTXOp(%121, %arg104) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [0 : i32, 0 : i32], device = "cuda", kernel_name = "Unknown148", memory_effects = [1 : i32, 2 : i32]} : memref, memref
+    byre.compute @PTXOp(%1, %arg105) {BlockSize.x = 256 : i32, GridSize.x = 10 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown149", memory_effects = [1 : i32, 2 : i32]} : memref<64x3x7x7xf16, "cuda">, memref<64x3x7x7xf32, "cuda">
+    byre.compute @PTXOp(%120, %arg108) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown150", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda">
+    byre.compute @PTXOp(%119, %arg111) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown150", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda">
+    byre.compute @PTXOp(%118, %arg114) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown150", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda">
+    byre.compute @PTXOp(%117, %arg117) {BlockSize.x = 256 : i32, GridSize.x = 36 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown150", memory_effects = [1 : i32, 2 : i32]} : memref<64x64x3x3xf16, "cuda">, memref<64x64x3x3xf32, "cuda">
+    byre.compute @PTXOp(%113, %arg120) {BlockSize.x = 256 : i32, GridSize.x = 72 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown154", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x3x3xf16, "cuda">, memref<128x64x3x3xf32, "cuda">
+    byre.compute @PTXOp(%111, %arg123) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown155", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda">
+    byre.compute @PTXOp(%115, %arg126) {BlockSize.x = 256 : i32, GridSize.x = 8 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown156", memory_effects = [1 : i32, 2 : i32]} : memref<128x64x1x1xf16, "cuda">, memref<128x64x1x1xf32, "cuda">
+    byre.compute @PTXOp(%109, %arg129) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown155", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda">
+    byre.compute @PTXOp(%108, %arg132) {BlockSize.x = 256 : i32, GridSize.x = 144 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown155", memory_effects = [1 : i32, 2 : i32]} : memref<128x128x3x3xf16, "cuda">, memref<128x128x3x3xf32, "cuda">
+    byre.compute @PTXOp(%14, %arg135) {BlockSize.x = 256 : i32, GridSize.x = 288 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown159", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x3x3xf16, "cuda">, memref<256x128x3x3xf32, "cuda">
+    byre.compute @PTXOp(%15, %arg138) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown160", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda">
+    byre.compute @PTXOp(%106, %arg141) {BlockSize.x = 256 : i32, GridSize.x = 32 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown161", memory_effects = [1 : i32, 2 : i32]} : memref<256x128x1x1xf16, "cuda">, memref<256x128x1x1xf32, "cuda">
+    byre.compute @PTXOp(%102, %arg144) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown160", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda">
+    byre.compute @PTXOp(%101, %arg147) {BlockSize.x = 256 : i32, GridSize.x = 576 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown160", memory_effects = [1 : i32, 2 : i32]} : memref<256x256x3x3xf16, "cuda">, memref<256x256x3x3xf32, "cuda">
+    byre.compute @PTXOp(%19, %arg150) {BlockSize.x = 256 : i32, GridSize.x = 1152 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown164", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x3x3xf16, "cuda">, memref<512x256x3x3xf32, "cuda">
+    byre.compute @PTXOp(%20, %arg153) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown165", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda">
+    byre.compute @PTXOp(%99, %arg156) {BlockSize.x = 256 : i32, GridSize.x = 128 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown166", memory_effects = [1 : i32, 2 : i32]} : memref<512x256x1x1xf16, "cuda">, memref<512x256x1x1xf32, "cuda">
+    byre.compute @PTXOp(%21, %arg159) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown165", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda">
+    byre.compute @PTXOp(%22, %arg162) {BlockSize.x = 256 : i32, GridSize.x = 2304 : i32, arg_ranks = [4 : i32, 4 : i32], device = "cuda", kernel_name = "Unknown165", memory_effects = [1 : i32, 2 : i32]} : memref<512x512x3x3xf16, "cuda">, memref<512x512x3x3xf32, "cuda">
+    %125 = "byre.alias"(%alloc) <{offset = 62959360 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<1000x512xf16, "cuda">
+    byre.compute @MatmulOp_f16f16_f16(%89, %96, %125) {device = "cuda", lhs_contracting_dimension = 0 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], output_transpose, rhs_contracting_dimension = 0 : i64} : memref<4x512xf16, "cuda">, memref<4x1000xf16, "cuda">, memref<1000x512xf16, "cuda">
+    byre.compute @PTXOp(%125, %arg165) {BlockSize.x = 256 : i32, GridSize.x = 500 : i32, arg_ranks = [2 : i32, 2 : i32], device = "cuda", kernel_name = "Unknown170", memory_effects = [1 : i32, 2 : i32]} : memref<1000x512xf16, "cuda">, memref<1000x512xf32, "cuda">
+    %126 = "byre.alias"(%alloc) <{offset = 62959360 : i64}> {device = "cuda"} : (memref<76533504xi8, "cuda">) -> memref<1000xf32, "cuda">
+    byre.compute @PTXOp(%96, %126) {BlockSize.x = 32 : i32, BlockSize.y = 2 : i32, BlockSize.z = 1 : i32, GridSize.x = 32 : i32, GridSize.y = 1 : i32, GridSize.z = 1 : i32, device = "cuda", kernel_name = "Unknown171_kernel"} : memref<4x1000xf16, "cuda">, memref<1000xf32, "cuda">
+    byre.compute @PTXOp(%126, %arg166) {BlockSize.x = 256 : i32, GridSize.x = 1 : i32, arg_ranks = [1 : i32, 1 : i32], device = "cuda", kernel_name = "Unknown172", memory_effects = [1 : i32, 2 : i32]} : memref<1000xf32, "cuda">, memref<1000xf32, "cuda">
+    return
+  }
+}
\ No newline at end of file